.
+ kernel_size (int): The convolution kernel size of the middle layer.
+ padding (int): Padding value of the convolution in the middle layer.
+ dilation (int, optional): Dilation value of the convolution in the middle layer.
+ no_residual (bool, optional): Disable residual block/output.
+
+ Note:
+ This implementation corresponds to the "non-causal" setting in the paper.
+ """
+
+ def __init__(
+ self,
+ io_channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ padding: int,
+ dilation: int = 1,
+ no_residual: bool = False,
+ ):
+ super().__init__()
+
+ self.conv_layers = torch.nn.Sequential(
+ torch.nn.Conv1d(in_channels=io_channels, out_channels=hidden_channels, kernel_size=1),
+ torch.nn.PReLU(),
+ torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08),
+ torch.nn.Conv1d(
+ in_channels=hidden_channels,
+ out_channels=hidden_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ dilation=dilation,
+ groups=hidden_channels,
+ ),
+ torch.nn.PReLU(),
+ torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08),
+ )
+
+ self.res_out = (
+ None
+ if no_residual
+ else torch.nn.Conv1d(in_channels=hidden_channels, out_channels=io_channels, kernel_size=1)
+ )
+ self.skip_out = torch.nn.Conv1d(in_channels=hidden_channels, out_channels=io_channels, kernel_size=1)
+
+ def forward(self, input: torch.Tensor) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
+ feature = self.conv_layers(input)
+ if self.res_out is None:
+ residual = None
+ else:
+ residual = self.res_out(feature)
+ skip_out = self.skip_out(feature)
+ return residual, skip_out
+
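+# Shape sketch (illustrative, not part of the module): with io_channels=128 and
+# hidden_channels=512, an input of shape (batch, 128, frames) yields
+#   residual: (batch, 128, frames)  -- added back onto the block input by the caller
+#   skip_out: (batch, 128, frames)  -- summed over all blocks by MaskGenerator
+# For the final block (no_residual=True), residual is None.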
+
+class MaskGenerator(torch.nn.Module):
+ """TCN (Temporal Convolution Network) Separation Module
+
+ Generates masks for separation.
+
+ Args:
+ input_dim (int): Input feature dimension.
+ num_sources (int): The number of sources to separate.
+ kernel_size (int): The convolution kernel size of conv blocks.
+ num_feats (int): Input/output feature dimension of conv blocks.
+ num_hidden (int): Intermediate feature dimension of conv blocks.
+ num_layers (int): The number of conv blocks in one stack.
+ num_stacks (int): The number of conv block stacks.
+ msk_activate (str): The activation function of the mask output.
+
+ Note:
+ This implementation corresponds to the "non-causal" setting in the paper.
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ num_sources: int,
+ kernel_size: int,
+ num_feats: int,
+ num_hidden: int,
+ num_layers: int,
+ num_stacks: int,
+ msk_activate: str,
+ ):
+ super().__init__()
+
+ self.input_dim = input_dim
+ self.num_sources = num_sources
+
+ self.input_norm = torch.nn.GroupNorm(num_groups=1, num_channels=input_dim, eps=1e-8)
+ self.input_conv = torch.nn.Conv1d(in_channels=input_dim, out_channels=num_feats, kernel_size=1)
+
+ self.receptive_field = 0
+ self.conv_layers = torch.nn.ModuleList([])
+ for s in range(num_stacks):
+ for l in range(num_layers):
+ multi = 2**l
+ self.conv_layers.append(
+ ConvBlock(
+ io_channels=num_feats,
+ hidden_channels=num_hidden,
+ kernel_size=kernel_size,
+ dilation=multi,
+ padding=multi,
+ # The last ConvBlock does not need residual
+ no_residual=(l == (num_layers - 1) and s == (num_stacks - 1)),
+ )
+ )
+ self.receptive_field += kernel_size if s == 0 and l == 0 else (kernel_size - 1) * multi
+ self.output_prelu = torch.nn.PReLU()
+ self.output_conv = torch.nn.Conv1d(
+ in_channels=num_feats,
+ out_channels=input_dim * num_sources,
+ kernel_size=1,
+ )
+ if msk_activate == "sigmoid":
+ self.mask_activate = torch.nn.Sigmoid()
+ elif msk_activate == "relu":
+ self.mask_activate = torch.nn.ReLU()
+ else:
+ raise ValueError(f"Unsupported activation {msk_activate}")
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ """Generate separation mask.
+
+ Args:
+ input (torch.Tensor): 3D Tensor with shape [batch, features, frames]
+
+ Returns:
+ Tensor: shape [batch, num_sources, features, frames]
+ """
+ batch_size = input.shape[0]
+ feats = self.input_norm(input)
+ feats = self.input_conv(feats)
+ output = 0.0
+ for layer in self.conv_layers:
+ residual, skip = layer(feats)
+ if residual is not None: # the last conv layer does not produce residual
+ feats = feats + residual
+ output = output + skip
+ output = self.output_prelu(output)
+ output = self.output_conv(output)
+ output = self.mask_activate(output)
+ return output.view(batch_size, self.num_sources, self.input_dim, -1)
+
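+# Worked example (illustrative, using the defaults of ConvTasNet below: kernel_size=3,
+# num_layers=8, num_stacks=3): the first block adds kernel_size frames to the receptive
+# field and every later block with dilation 2**l adds (kernel_size - 1) * 2**l, so
+#   receptive_field = 3 + 2 * (2 + 4 + ... + 128) + 2 * 2 * (1 + 2 + ... + 128)
+#                   = 3 + 508 + 1020 = 1531 feature frames.
+# The returned mask of shape (batch, num_sources, features, frames) is multiplied
+# element-wise with the encoder output in ConvTasNet.forward.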
+
+class ConvTasNet(torch.nn.Module):
+ """Conv-TasNet architecture introduced in
+ *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
+ :cite:`Luo_2019`.
+
+ Note:
+ This implementation corresponds to the "non-causal" setting in the paper.
+
+ See Also:
+ * :class:`torchaudio.pipelines.SourceSeparationBundle`: Source separation pipeline with pre-trained models.
+
+ Args:
+ num_sources (int, optional): The number of sources to split.
+ enc_kernel_size (int, optional): The convolution kernel size of the encoder/decoder.
+ enc_num_feats (int, optional): The feature dimensions passed to mask generator.
+ msk_kernel_size (int, optional): The convolution kernel size of the mask generator.
+ msk_num_feats (int, optional): The input/output feature dimension of conv block in the mask generator.
+ msk_num_hidden_feats (int, optional): The internal feature dimension of conv block of the mask generator.
+ msk_num_layers (int, optional): The number of layers in one conv block of the mask generator.
+ msk_num_stacks (int, optional): The number of conv blocks of the mask generator.
+ msk_activate (str, optional): The activation function of the mask output (Default: ``sigmoid``).
+ """
+
+ def __init__(
+ self,
+ num_sources: int = 2,
+ # encoder/decoder parameters
+ enc_kernel_size: int = 16,
+ enc_num_feats: int = 512,
+ # mask generator parameters
+ msk_kernel_size: int = 3,
+ msk_num_feats: int = 128,
+ msk_num_hidden_feats: int = 512,
+ msk_num_layers: int = 8,
+ msk_num_stacks: int = 3,
+ msk_activate: str = "sigmoid",
+ ):
+ super().__init__()
+
+ self.num_sources = num_sources
+ self.enc_num_feats = enc_num_feats
+ self.enc_kernel_size = enc_kernel_size
+ self.enc_stride = enc_kernel_size // 2
+
+ self.encoder = torch.nn.Conv1d(
+ in_channels=1,
+ out_channels=enc_num_feats,
+ kernel_size=enc_kernel_size,
+ stride=self.enc_stride,
+ padding=self.enc_stride,
+ bias=False,
+ )
+ self.mask_generator = MaskGenerator(
+ input_dim=enc_num_feats,
+ num_sources=num_sources,
+ kernel_size=msk_kernel_size,
+ num_feats=msk_num_feats,
+ num_hidden=msk_num_hidden_feats,
+ num_layers=msk_num_layers,
+ num_stacks=msk_num_stacks,
+ msk_activate=msk_activate,
+ )
+ self.decoder = torch.nn.ConvTranspose1d(
+ in_channels=enc_num_feats,
+ out_channels=1,
+ kernel_size=enc_kernel_size,
+ stride=self.enc_stride,
+ padding=self.enc_stride,
+ bias=False,
+ )
+
+ def _align_num_frames_with_strides(self, input: torch.Tensor) -> Tuple[torch.Tensor, int]:
+ """Pad input Tensor so that the end of the input tensor corresponds with
+
+ 1. (if kernel size is odd) the center of the last convolution kernel
+ or 2. (if kernel size is even) the end of the first half of the last convolution kernel
+
+ Assumption:
+ The resulting Tensor will be padded with the size of stride (== kernel_width // 2)
+ on both ends in Conv1D
+
+ |<--- k_1 --->|
+ | | |<-- k_n-1 -->|
+ | | | |<--- k_n --->|
+ | | | | |
+ | | | | |
+ | v v v |
+ |<---->|<--- input signal --->|<--->|<---->|
+ stride PAD stride
+
+ Args:
+ input (torch.Tensor): 3D Tensor with shape (batch_size, channels==1, frames)
+
+ Returns:
+ Tensor: Padded Tensor
+ int: Number of paddings performed
+ """
+ batch_size, num_channels, num_frames = input.shape
+ is_odd = self.enc_kernel_size % 2
+ num_strides = (num_frames - is_odd) // self.enc_stride
+ num_remainings = num_frames - (is_odd + num_strides * self.enc_stride)
+ if num_remainings == 0:
+ return input, 0
+
+ num_paddings = self.enc_stride - num_remainings
+ pad = torch.zeros(
+ batch_size,
+ num_channels,
+ num_paddings,
+ dtype=input.dtype,
+ device=input.device,
+ )
+ return torch.cat([input, pad], 2), num_paddings
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ """Perform source separation. Generate audio source waveforms.
+
+ Args:
+ input (torch.Tensor): 3D Tensor with shape [batch, channel==1, frames]
+
+ Returns:
+ Tensor: 3D Tensor with shape [batch, channel==num_sources, frames]
+ """
+ if input.ndim != 3 or input.shape[1] != 1:
+ raise ValueError(f"Expected 3D tensor (batch, channel==1, frames). Found: {input.shape}")
+
+ # B: batch size
+ # L: input frame length
+ # L': padded input frame length
+ # F: feature dimension
+ # M: feature frame length
+ # S: number of sources
+
+ padded, num_pads = self._align_num_frames_with_strides(input) # B, 1, L'
+ batch_size, num_padded_frames = padded.shape[0], padded.shape[2]
+ feats = self.encoder(padded) # B, F, M
+ masked = self.mask_generator(feats) * feats.unsqueeze(1) # B, S, F, M
+ masked = masked.view(batch_size * self.num_sources, self.enc_num_feats, -1) # B*S, F, M
+ decoded = self.decoder(masked) # B*S, 1, L'
+ output = decoded.view(batch_size, self.num_sources, num_padded_frames) # B, S, L'
+ if num_pads > 0:
+ output = output[..., :-num_pads] # B, S, L
+ return output
+
+
+def conv_tasnet_base(num_sources: int = 2) -> ConvTasNet:
+ r"""Builds non-causal version of :class:`~torchaudio.models.ConvTasNet`.
+
+ The parameter settings follow the ones with the highest Si-SNR metric score in the paper,
+ except the mask activation function is changed from "sigmoid" to "relu" for performance improvement.
+
+ Args:
+ num_sources (int, optional): Number of sources in the output.
+ (Default: 2)
+ Returns:
+ ConvTasNet:
+ ConvTasNet model.
+ """
+ return ConvTasNet(
+ num_sources=num_sources,
+ enc_kernel_size=16,
+ enc_num_feats=512,
+ msk_kernel_size=3,
+ msk_num_feats=128,
+ msk_num_hidden_feats=512,
+ msk_num_layers=8,
+ msk_num_stacks=3,
+ msk_activate="relu",
+ )
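+
+
+# Usage sketch (illustrative only, not part of the module): separate a batch of
+# single-channel mixtures into num_sources estimated source waveforms.
+#
+#   model = conv_tasnet_base(num_sources=2)
+#   mixture = torch.rand(4, 1, 32000)   # hypothetical batch of 2-second clips at 16 kHz
+#   estimates = model(mixture)          # -> shape (4, 2, 32000)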
diff --git a/MLPY/Lib/site-packages/torchaudio/models/decoder/__init__.py b/MLPY/Lib/site-packages/torchaudio/models/decoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b13d3b9e3567347ab494fddd3ee2b0106fec22a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/decoder/__init__.py
@@ -0,0 +1,46 @@
+_CTC_DECODERS = [
+ "CTCHypothesis",
+ "CTCDecoder",
+ "CTCDecoderLM",
+ "CTCDecoderLMState",
+ "ctc_decoder",
+ "download_pretrained_files",
+]
+_CUDA_CTC_DECODERS = [
+ "CUCTCDecoder",
+ "CUCTCHypothesis",
+ "cuda_ctc_decoder",
+]
+
+
+def __getattr__(name: str):
+ if name in _CTC_DECODERS:
+ try:
+ from . import _ctc_decoder
+ except Exception as err:
+ raise RuntimeError(
+ "CTC Decoder suit requires flashlight-text package and optionally KenLM. Please install them."
+ ) from err
+
+ item = getattr(_ctc_decoder, name)
+ globals()[name] = item
+ return item
+ elif name in _CUDA_CTC_DECODERS:
+ try:
+ from . import _cuda_ctc_decoder
+ except AttributeError as err:
+ raise RuntimeError(
+ "To use CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
+ ) from err
+
+ item = getattr(_cuda_ctc_decoder, name)
+ globals()[name] = item
+ return item
+ raise AttributeError(f"module {__name__} has no attribute {name}")
+
+
+def __dir__():
+ return sorted(__all__)
+
+
+__all__ = _CTC_DECODERS + _CUDA_CTC_DECODERS
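+
+# Note (illustrative): the module-level ``__getattr__`` above implements PEP 562 lazy
+# imports, so the optional decoder dependencies are only imported when one of the
+# exported names is first accessed, e.g.
+#
+#   from torchaudio.models.decoder import ctc_decoder   # triggers the lazy import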
diff --git a/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07527eb57d9705989d8282a8b9ca30caa1bfb1f2
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/_ctc_decoder.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/_ctc_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20988170f7b1b08f334b5f8dbc9b061b4d186244
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/_ctc_decoder.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/_cuda_ctc_decoder.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/_cuda_ctc_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7702b880abcbc52663c8dccf555e1b5911b3bd42
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/decoder/__pycache__/_cuda_ctc_decoder.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/decoder/_ctc_decoder.py b/MLPY/Lib/site-packages/torchaudio/models/decoder/_ctc_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c379781f64d54a3b826d953d6cfd4de22051695f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/decoder/_ctc_decoder.py
@@ -0,0 +1,568 @@
+from __future__ import annotations
+
+import itertools as it
+
+from abc import abstractmethod
+from collections import namedtuple
+from typing import Dict, List, NamedTuple, Optional, Tuple, Union
+
+import torch
+
+from flashlight.lib.text.decoder import (
+ CriterionType as _CriterionType,
+ LexiconDecoder as _LexiconDecoder,
+ LexiconDecoderOptions as _LexiconDecoderOptions,
+ LexiconFreeDecoder as _LexiconFreeDecoder,
+ LexiconFreeDecoderOptions as _LexiconFreeDecoderOptions,
+ LM as _LM,
+ LMState as _LMState,
+ SmearingMode as _SmearingMode,
+ Trie as _Trie,
+ ZeroLM as _ZeroLM,
+)
+from flashlight.lib.text.dictionary import (
+ create_word_dict as _create_word_dict,
+ Dictionary as _Dictionary,
+ load_words as _load_words,
+)
+from torchaudio.utils import download_asset
+
+try:
+ from flashlight.lib.text.decoder.kenlm import KenLM as _KenLM
+except Exception:
+ try:
+ from flashlight.lib.text.decoder import KenLM as _KenLM
+ except Exception:
+ _KenLM = None
+
+__all__ = [
+ "CTCHypothesis",
+ "CTCDecoder",
+ "CTCDecoderLM",
+ "CTCDecoderLMState",
+ "ctc_decoder",
+ "download_pretrained_files",
+]
+
+_PretrainedFiles = namedtuple("PretrainedFiles", ["lexicon", "tokens", "lm"])
+
+
+def _construct_trie(tokens_dict, word_dict, lexicon, lm, silence):
+ vocab_size = tokens_dict.index_size()
+ trie = _Trie(vocab_size, silence)
+ start_state = lm.start(False)
+
+ for word, spellings in lexicon.items():
+ word_idx = word_dict.get_index(word)
+ _, score = lm.score(start_state, word_idx)
+ for spelling in spellings:
+ spelling_idx = [tokens_dict.get_index(token) for token in spelling]
+ trie.insert(spelling_idx, word_idx, score)
+ trie.smear(_SmearingMode.MAX)
+ return trie
+
+
+def _get_word_dict(lexicon, lm, lm_dict, tokens_dict, unk_word):
+ word_dict = None
+ if lm_dict is not None:
+ word_dict = _Dictionary(lm_dict)
+
+ if lexicon and word_dict is None:
+ word_dict = _create_word_dict(lexicon)
+ elif not lexicon and word_dict is None and type(lm) == str:
+ d = {tokens_dict.get_entry(i): [[tokens_dict.get_entry(i)]] for i in range(tokens_dict.index_size())}
+ d[unk_word] = [[unk_word]]
+ word_dict = _create_word_dict(d)
+
+ return word_dict
+
+
+class CTCHypothesis(NamedTuple):
+ r"""Represents hypothesis generated by CTC beam search decoder :class:`CTCDecoder`."""
+ tokens: torch.LongTensor
+ """Predicted sequence of token IDs. Shape `(L, )`, where `L` is the length of the output sequence"""
+
+ words: List[str]
+ """List of predicted words.
+
+ Note:
+ This attribute is only applicable if a lexicon is provided to the decoder. If
+ decoding without a lexicon, it will be blank. Please refer to :attr:`tokens` and
+ :func:`~torchaudio.models.decoder.CTCDecoder.idxs_to_tokens` instead.
+ """
+
+ score: float
+ """Score corresponding to hypothesis"""
+
+ timesteps: torch.IntTensor
+ """Timesteps corresponding to the tokens. Shape `(L, )`, where `L` is the length of the output sequence"""
+
+
+class CTCDecoderLMState(_LMState):
+ """Language model state."""
+
+ @property
+ def children(self) -> Dict[int, CTCDecoderLMState]:
+ """Map of indices to LM states"""
+ return super().children
+
+ def child(self, usr_index: int) -> CTCDecoderLMState:
+ """Returns child corresponding to usr_index, or creates and returns a new state if input index
+ is not found.
+
+ Args:
+ usr_index (int): index corresponding to child state
+
+ Returns:
+ CTCDecoderLMState: child state corresponding to usr_index
+ """
+ return super().child(usr_index)
+
+ def compare(self, state: CTCDecoderLMState) -> CTCDecoderLMState:
+ """Compare two language model states.
+
+ Args:
+ state (CTCDecoderLMState): LM state to compare against
+
+ Returns:
+ int: 0 if the states are the same, -1 if self is less, +1 if self is greater.
+ """
+ pass
+
+
+class CTCDecoderLM(_LM):
+ """Language model base class for creating custom language models to use with the decoder."""
+
+ @abstractmethod
+ def start(self, start_with_nothing: bool) -> CTCDecoderLMState:
+ """Initialize or reset the language model.
+
+ Args:
+ start_with_nothing (bool): whether or not to start sentence with sil token.
+
+ Returns:
+ CTCDecoderLMState: starting state
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def score(self, state: CTCDecoderLMState, usr_token_idx: int) -> Tuple[CTCDecoderLMState, float]:
+ """Evaluate the language model based on the current LM state and new word.
+
+ Args:
+ state (CTCDecoderLMState): current LM state
+ usr_token_idx (int): index of the word
+
+ Returns:
+ (CTCDecoderLMState, float)
+ CTCDecoderLMState:
+ new LM state
+ float:
+ score
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def finish(self, state: CTCDecoderLMState) -> Tuple[CTCDecoderLMState, float]:
+ """Evaluate end for language model based on current LM state.
+
+ Args:
+ state (CTCDecoderLMState): current LM state
+
+ Returns:
+ (CTCDecoderLMState, float)
+ CTCDecoderLMState:
+ new LM state
+ float:
+ score
+ """
+ raise NotImplementedError
+
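+# Minimal sketch (illustrative, not shipped here) of a custom LM implementing this
+# interface; it assigns a score of 0.0 to every token, similar in spirit to ZeroLM:
+#
+#   class _ZeroScoreLM(CTCDecoderLM):
+#       def start(self, start_with_nothing: bool) -> CTCDecoderLMState:
+#           return CTCDecoderLMState()
+#
+#       def score(self, state: CTCDecoderLMState, usr_token_idx: int):
+#           return state.child(usr_token_idx), 0.0
+#
+#       def finish(self, state: CTCDecoderLMState):
+#           return state, 0.0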
+
+class CTCDecoder:
+ """CTC beam search decoder from *Flashlight* :cite:`kahn2022flashlight`.
+
+ .. devices:: CPU
+
+ Note:
+ To build the decoder, please use the factory function :func:`ctc_decoder`.
+ """
+
+ def __init__(
+ self,
+ nbest: int,
+ lexicon: Optional[Dict],
+ word_dict: _Dictionary,
+ tokens_dict: _Dictionary,
+ lm: CTCDecoderLM,
+ decoder_options: Union[_LexiconDecoderOptions, _LexiconFreeDecoderOptions],
+ blank_token: str,
+ sil_token: str,
+ unk_word: str,
+ ) -> None:
+ """
+ Args:
+ nbest (int): number of best decodings to return
+ lexicon (Dict or None): lexicon mapping of words to spellings, or None for lexicon-free decoder
+ word_dict (_Dictionary): dictionary of words
+ tokens_dict (_Dictionary): dictionary of tokens
+ lm (CTCDecoderLM): language model. If using a lexicon, only word level LMs are currently supported
+ decoder_options (_LexiconDecoderOptions or _LexiconFreeDecoderOptions):
+ parameters used for beam search decoding
+ blank_token (str): token corresponding to blank
+ sil_token (str): token corresponding to silence
+ unk_word (str): word corresponding to unknown
+ """
+
+ self.nbest = nbest
+ self.word_dict = word_dict
+ self.tokens_dict = tokens_dict
+ self.blank = self.tokens_dict.get_index(blank_token)
+ silence = self.tokens_dict.get_index(sil_token)
+ transitions = []
+
+ if lexicon:
+ trie = _construct_trie(tokens_dict, word_dict, lexicon, lm, silence)
+ unk_word = word_dict.get_index(unk_word)
+ token_lm = False # use word level LM
+
+ self.decoder = _LexiconDecoder(
+ decoder_options,
+ trie,
+ lm,
+ silence,
+ self.blank,
+ unk_word,
+ transitions,
+ token_lm,
+ )
+ else:
+ self.decoder = _LexiconFreeDecoder(decoder_options, lm, silence, self.blank, transitions)
+ # https://github.com/pytorch/audio/issues/3218
+ # If lm is passed like rvalue reference, the lm object gets garbage collected,
+ # and later call to the lm fails.
+ # This ensures that lm object is not deleted as long as the decoder is alive.
+ # https://github.com/pybind/pybind11/discussions/4013
+ self.lm = lm
+
+ def _get_tokens(self, idxs: torch.IntTensor) -> torch.LongTensor:
+ idxs = (g[0] for g in it.groupby(idxs))
+ idxs = filter(lambda x: x != self.blank, idxs)
+ return torch.LongTensor(list(idxs))
+
+ def _get_timesteps(self, idxs: torch.IntTensor) -> torch.IntTensor:
+ """Returns frame numbers corresponding to non-blank tokens."""
+
+ timesteps = []
+ for i, idx in enumerate(idxs):
+ if idx == self.blank:
+ continue
+ if i == 0 or idx != idxs[i - 1]:
+ timesteps.append(i)
+ return torch.IntTensor(timesteps)
+
+ def decode_begin(self):
+ """Initialize the internal state of the decoder.
+
+ See :py:meth:`decode_step` for the usage.
+
+ .. note::
+
+ This method is required only when performing online decoding.
+ It is not necessary when performing batch decoding with :py:meth:`__call__`.
+ """
+ self.decoder.decode_begin()
+
+ def decode_end(self):
+ """Finalize the internal state of the decoder.
+
+ See :py:meth:`decode_step` for the usage.
+
+ .. note::
+
+ This method is required only when performing online decoding.
+ It is not necessary when performing batch decoding with :py:meth:`__call__`.
+ """
+ self.decoder.decode_end()
+
+ def decode_step(self, emissions: torch.FloatTensor):
+ """Perform incremental decoding on top of the curent internal state.
+
+ .. note::
+
+ This method is required only when performing online decoding.
+ It is not necessary when performing batch decoding with :py:meth:`__call__`.
+
+ Args:
+ emissions (torch.FloatTensor): CPU tensor of shape `(frame, num_tokens)` storing sequences of
+ probability distribution over labels; output of acoustic model.
+
+ Example:
+ >>> decoder = torchaudio.models.decoder.ctc_decoder(...)
+ >>> decoder.decode_begin()
+ >>> decoder.decode_step(emission1)
+ >>> decoder.decode_step(emission2)
+ >>> decoder.decode_end()
+ >>> result = decoder.get_final_hypothesis()
+ """
+ if emissions.dtype != torch.float32:
+ raise ValueError("emissions must be float32.")
+
+ if not emissions.is_cpu:
+ raise RuntimeError("emissions must be a CPU tensor.")
+
+ if not emissions.is_contiguous():
+ raise RuntimeError("emissions must be contiguous.")
+
+ if emissions.ndim != 2:
+ raise RuntimeError(f"emissions must be 2D. Found {emissions.shape}")
+
+ T, N = emissions.size()
+ self.decoder.decode_step(emissions.data_ptr(), T, N)
+
+ def _to_hypo(self, results) -> List[CTCHypothesis]:
+ return [
+ CTCHypothesis(
+ tokens=self._get_tokens(result.tokens),
+ words=[self.word_dict.get_entry(x) for x in result.words if x >= 0],
+ score=result.score,
+ timesteps=self._get_timesteps(result.tokens),
+ )
+ for result in results
+ ]
+
+ def get_final_hypothesis(self) -> List[CTCHypothesis]:
+ """Get the final hypothesis
+
+ Returns:
+ List[CTCHypothesis]:
+ List of sorted best hypotheses.
+
+ .. note::
+
+ This method is required only when performing online decoding.
+ It is not necessary when performing batch decoding with :py:meth:`__call__`.
+ """
+ results = self.decoder.get_all_final_hypothesis()
+ return self._to_hypo(results[: self.nbest])
+
+ def __call__(
+ self, emissions: torch.FloatTensor, lengths: Optional[torch.Tensor] = None
+ ) -> List[List[CTCHypothesis]]:
+ """
+ Performs batched offline decoding.
+
+ .. note::
+
+ This method performs offline decoding in one go. To perform incremental decoding,
+ please refer to :py:meth:`decode_step`.
+
+ Args:
+ emissions (torch.FloatTensor): CPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
+ probability distribution over labels; output of acoustic model.
+ lengths (Tensor or None, optional): CPU tensor of shape `(batch, )` storing the valid length
+ in the time axis of the output Tensor in each batch.
+
+ Returns:
+ List[List[CTCHypothesis]]:
+ List of sorted best hypotheses for each audio sequence in the batch.
+ """
+
+ if emissions.dtype != torch.float32:
+ raise ValueError("emissions must be float32.")
+
+ if not emissions.is_cpu:
+ raise RuntimeError("emissions must be a CPU tensor.")
+
+ if not emissions.is_contiguous():
+ raise RuntimeError("emissions must be contiguous.")
+
+ if emissions.ndim != 3:
+ raise RuntimeError(f"emissions must be 3D. Found {emissions.shape}")
+
+ if lengths is not None and not lengths.is_cpu:
+ raise RuntimeError("lengths must be a CPU tensor.")
+
+ B, T, N = emissions.size()
+ if lengths is None:
+ lengths = torch.full((B,), T)
+
+ float_bytes = 4
+ hypos = []
+
+ for b in range(B):
+ emissions_ptr = emissions.data_ptr() + float_bytes * b * emissions.stride(0)
+ results = self.decoder.decode(emissions_ptr, lengths[b], N)
+ hypos.append(self._to_hypo(results[: self.nbest]))
+ return hypos
+
+ def idxs_to_tokens(self, idxs: torch.LongTensor) -> List:
+ """
+ Map raw token IDs into corresponding tokens
+
+ Args:
+ idxs (LongTensor): raw token IDs generated from decoder
+
+ Returns:
+ List: tokens corresponding to the input IDs
+ """
+ return [self.tokens_dict.get_entry(idx.item()) for idx in idxs]
+
+
+def ctc_decoder(
+ lexicon: Optional[str],
+ tokens: Union[str, List[str]],
+ lm: Union[str, CTCDecoderLM] = None,
+ lm_dict: Optional[str] = None,
+ nbest: int = 1,
+ beam_size: int = 50,
+ beam_size_token: Optional[int] = None,
+ beam_threshold: float = 50,
+ lm_weight: float = 2,
+ word_score: float = 0,
+ unk_score: float = float("-inf"),
+ sil_score: float = 0,
+ log_add: bool = False,
+ blank_token: str = "-",
+ sil_token: str = "|",
+ unk_word: str = "",
+) -> CTCDecoder:
+ """Builds an instance of :class:`CTCDecoder`.
+
+ Args:
+ lexicon (str or None): lexicon file containing the possible words and corresponding spellings.
+ Each line consists of a word and its space separated spelling. If `None`, uses lexicon-free
+ decoding.
+ tokens (str or List[str]): file or list containing valid tokens. If using a file, the expected
+ format is for tokens mapping to the same index to be on the same line
+ lm (str, CTCDecoderLM, or None, optional): either a path containing KenLM language model,
+ custom language model of type `CTCDecoderLM`, or `None` if not using a language model
+ lm_dict (str or None, optional): file consisting of the dictionary used for the LM, with a word
+ per line sorted by LM index. If decoding with a lexicon, entries in lm_dict must also occur
+ in the lexicon file. If `None`, dictionary for LM is constructed using the lexicon file.
+ (Default: None)
+ nbest (int, optional): number of best decodings to return (Default: 1)
+ beam_size (int, optional): max number of hypos to hold after each decode step (Default: 50)
+ beam_size_token (int, optional): max number of tokens to consider at each decode step.
+ If `None`, it is set to the total number of tokens (Default: None)
+ beam_threshold (float, optional): threshold for pruning hypotheses (Default: 50)
+ lm_weight (float, optional): weight of language model (Default: 2)
+ word_score (float, optional): word insertion score (Default: 0)
+ unk_score (float, optional): unknown word insertion score (Default: -inf)
+ sil_score (float, optional): silence insertion score (Default: 0)
+ log_add (bool, optional): whether or not to use logadd when merging hypotheses (Default: False)
+ blank_token (str, optional): token corresponding to blank (Default: "-")
+ sil_token (str, optional): token corresponding to silence (Default: "|")
+ unk_word (str, optional): word corresponding to unknown (Default: "")
+
+ Returns:
+ CTCDecoder: decoder
+
+ Example
+ >>> decoder = ctc_decoder(
+ >>> lexicon="lexicon.txt",
+ >>> tokens="tokens.txt",
+ >>> lm="kenlm.bin",
+ >>> )
+ >>> results = decoder(emissions) # List of shape (B, nbest) of Hypotheses
+ """
+ if lm_dict is not None and type(lm_dict) is not str:
+ raise ValueError("lm_dict must be None or str type.")
+
+ tokens_dict = _Dictionary(tokens)
+
+ # decoder options
+ if lexicon:
+ lexicon = _load_words(lexicon)
+ decoder_options = _LexiconDecoderOptions(
+ beam_size=beam_size,
+ beam_size_token=beam_size_token or tokens_dict.index_size(),
+ beam_threshold=beam_threshold,
+ lm_weight=lm_weight,
+ word_score=word_score,
+ unk_score=unk_score,
+ sil_score=sil_score,
+ log_add=log_add,
+ criterion_type=_CriterionType.CTC,
+ )
+ else:
+ decoder_options = _LexiconFreeDecoderOptions(
+ beam_size=beam_size,
+ beam_size_token=beam_size_token or tokens_dict.index_size(),
+ beam_threshold=beam_threshold,
+ lm_weight=lm_weight,
+ sil_score=sil_score,
+ log_add=log_add,
+ criterion_type=_CriterionType.CTC,
+ )
+
+ # construct word dict and language model
+ word_dict = _get_word_dict(lexicon, lm, lm_dict, tokens_dict, unk_word)
+
+ if type(lm) == str:
+ if _KenLM is None:
+ raise RuntimeError(
+ "flashlight-text is installed, but KenLM is not installed. "
+ "Please refer to https://github.com/kpu/kenlm#python-module for how to install it."
+ )
+ lm = _KenLM(lm, word_dict)
+ elif lm is None:
+ lm = _ZeroLM()
+
+ return CTCDecoder(
+ nbest=nbest,
+ lexicon=lexicon,
+ word_dict=word_dict,
+ tokens_dict=tokens_dict,
+ lm=lm,
+ decoder_options=decoder_options,
+ blank_token=blank_token,
+ sil_token=sil_token,
+ unk_word=unk_word,
+ )
+
+
+def _get_filenames(model: str) -> _PretrainedFiles:
+ if model not in ["librispeech", "librispeech-3-gram", "librispeech-4-gram"]:
+ raise ValueError(
+ f"{model} not supported. Must be one of ['librispeech-3-gram', 'librispeech-4-gram', 'librispeech']"
+ )
+
+ prefix = f"decoder-assets/{model}"
+ return _PretrainedFiles(
+ lexicon=f"{prefix}/lexicon.txt",
+ tokens=f"{prefix}/tokens.txt",
+ lm=f"{prefix}/lm.bin" if model != "librispeech" else None,
+ )
+
+
+def download_pretrained_files(model: str) -> _PretrainedFiles:
+ """
+ Retrieves pretrained data files used for :func:`ctc_decoder`.
+
+ Args:
+ model (str): pretrained language model to download.
+ Valid values are: ``"librispeech-3-gram"``, ``"librispeech-4-gram"`` and ``"librispeech"``.
+
+ Returns:
+ Object with the following attributes
+
+ * ``lm``: path corresponding to downloaded language model,
+ or ``None`` if the model is not associated with an lm
+ * ``lexicon``: path corresponding to downloaded lexicon file
+ * ``tokens``: path corresponding to downloaded tokens file
+ """
+
+ files = _get_filenames(model)
+ lexicon_file = download_asset(files.lexicon)
+ tokens_file = download_asset(files.tokens)
+ if files.lm is not None:
+ lm_file = download_asset(files.lm)
+ else:
+ lm_file = None
+
+ return _PretrainedFiles(
+ lexicon=lexicon_file,
+ tokens=tokens_file,
+ lm=lm_file,
+ )
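+
+
+# End-to-end sketch (illustrative): fetch the pretrained LibriSpeech files and build a
+# lexicon-based decoder from them, then decode acoustic-model emissions.
+#
+#   files = download_pretrained_files("librispeech-4-gram")
+#   decoder = ctc_decoder(lexicon=files.lexicon, tokens=files.tokens, lm=files.lm)
+#   hypotheses = decoder(emissions)   # emissions: float32 CPU tensor of shape (batch, frame, num_tokens)
+#   transcript = " ".join(hypotheses[0][0].words)   # best hypothesis for the first utterance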
diff --git a/MLPY/Lib/site-packages/torchaudio/models/decoder/_cuda_ctc_decoder.py b/MLPY/Lib/site-packages/torchaudio/models/decoder/_cuda_ctc_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..aebba02661cb25c5c14b42680fb77c7b5964e6e3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/decoder/_cuda_ctc_decoder.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+import math
+
+from typing import List, NamedTuple, Union
+
+import torch
+import torchaudio
+
+torchaudio._extension._load_lib("libctc_prefix_decoder")
+import torchaudio.lib.pybind11_prefixctc as cuctc
+
+
+__all__ = ["CUCTCHypothesis", "CUCTCDecoder", "cuda_ctc_decoder"]
+
+
+def _get_vocab_list(vocab_file):
+ vocab = []
+ with open(vocab_file, "r", encoding="utf-8") as f:
+ for line in f:
+ line = line.strip().split()
+ vocab.append(line[0])
+ return vocab
+
+
+class CUCTCHypothesis(NamedTuple):
+ r"""Represents hypothesis generated by CUCTC beam search decoder :class:`CUCTCDecoder`."""
+ tokens: List[int]
+ """Predicted sequence of token IDs. Shape `(L, )`, where `L` is the length of the output sequence"""
+
+ words: List[str]
+ """List of predicted tokens. Algin with modeling unit.
+ """
+
+ score: float
+ """Score corresponding to hypothesis"""
+
+
+_DEFAULT_BLANK_SKIP_THREASHOLD = 0.95
+
+
+class CUCTCDecoder:
+ """CUDA CTC beam search decoder.
+
+ .. devices:: CUDA
+
+ Note:
+ To build the decoder, please use the factory function :func:`cuda_ctc_decoder`.
+ """
+
+ def __init__(
+ self,
+ vocab_list: List[str],
+ blank_id: int = 0,
+ beam_size: int = 10,
+ nbest: int = 1,
+ blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
+ cuda_stream: torch.cuda.streams.Stream = None,
+ ):
+ """
+ Args:
+ blank_id (int): token ID corresponding to blank; only 0 is supported for now. (Default: 0)
+ vocab_list (List[str]): list of vocabulary tokens
+ beam_size (int, optional): max number of hypos to hold after each decode step (Default: 10)
+ nbest (int): number of best decodings to return
+ blank_skip_threshold (float):
+ skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding.
+ (Default: 0.95).
+ cuda_stream (torch.cuda.streams.Stream): using assigned cuda stream (Default: using default stream)
+
+ """
+ if cuda_stream:
+ if not isinstance(cuda_stream, torch.cuda.streams.Stream):
+ raise AssertionError("cuda_stream must be torch.cuda.streams.Stream")
+ cuda_stream_ = cuda_stream.cuda_stream if cuda_stream else torch.cuda.current_stream().cuda_stream
+ self.internal_data = cuctc.prefixCTC_alloc(cuda_stream_)
+ self.memory = torch.empty(0, dtype=torch.int8, device=torch.device("cuda"))
+ if blank_id != 0:
+ raise AssertionError("blank_id must be 0")
+ self.blank_id = blank_id
+ self.vocab_list = vocab_list
+ self.space_id = 0
+ self.nbest = nbest
+ if not (blank_skip_threshold >= 0 and blank_skip_threshold <= 1):
+ raise AssertionError("blank_skip_threshold must be between 0 and 1")
+ self.blank_skip_threshold = math.log(blank_skip_threshold)
+ self.beam_size = min(beam_size, len(vocab_list)) # beam size must be smaller than vocab size
+
+ def __del__(self):
+ if cuctc is not None:
+ cuctc.prefixCTC_free(self.internal_data)
+
+ def __call__(self, log_prob: torch.Tensor, encoder_out_lens: torch.Tensor):
+ """
+ Args:
+ log_prob (torch.FloatTensor): GPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
+ probability distribution over labels; log_softmax(output of acoustic model).
+ encoder_out_lens (torch.Tensor of dtype torch.int32): GPU tensor of shape `(batch, )` storing the valid
+ length in the time axis of the output Tensor in each batch.
+
+ Returns:
+ List[List[CUCTCHypothesis]]:
+ List of sorted best hypotheses for each audio sequence in the batch.
+ """
+ if not encoder_out_lens.dtype == torch.int32:
+ raise AssertionError("encoder_out_lens must be torch.int32")
+ if not log_prob.dtype == torch.float32:
+ raise AssertionError("log_prob must be torch.float32")
+ if not (log_prob.is_cuda and encoder_out_lens.is_cuda):
+ raise AssertionError("inputs must be cuda tensors")
+ if not (log_prob.is_contiguous() and encoder_out_lens.is_contiguous()):
+ raise AssertionError("input tensors must be contiguous")
+ required_size, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
+ self.internal_data,
+ self.memory.data_ptr(),
+ self.memory.size(0),
+ log_prob.data_ptr(),
+ encoder_out_lens.data_ptr(),
+ log_prob.size(),
+ log_prob.stride(),
+ self.beam_size,
+ self.blank_id,
+ self.space_id,
+ self.blank_skip_threshold,
+ )
+ if required_size > 0:
+ self.memory = torch.empty(required_size, dtype=torch.int8, device=log_prob.device).contiguous()
+ _, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
+ self.internal_data,
+ self.memory.data_ptr(),
+ self.memory.size(0),
+ log_prob.data_ptr(),
+ encoder_out_lens.data_ptr(),
+ log_prob.size(),
+ log_prob.stride(),
+ self.beam_size,
+ self.blank_id,
+ self.space_id,
+ self.blank_skip_threshold,
+ )
+ batch_size = len(score_hyps)
+ hypos = []
+ for i in range(batch_size):
+ hypos.append(
+ [
+ CUCTCHypothesis(
+ tokens=score_hyps[i][j][1],
+ words=[self.vocab_list[word_id] for word_id in score_hyps[i][j][1]],
+ score=score_hyps[i][j][0],
+ )
+ for j in range(self.nbest)
+ ]
+ )
+ return hypos
+
+
+def cuda_ctc_decoder(
+ tokens: Union[str, List[str]],
+ nbest: int = 1,
+ beam_size: int = 10,
+ blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
+) -> CUCTCDecoder:
+ """Builds an instance of :class:`CUCTCDecoder`.
+
+ Args:
+ tokens (str or List[str]): File or list containing valid tokens.
+ If using a file, the expected format is for tokens mapping to the same index to be on the same line
+ beam_size (int, optional): The maximum number of hypos to hold after each decode step (Default: 10)
+ nbest (int): The number of best decodings to return
+ blank_skip_threshold (float): skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding
+ (Default: 0.95).
+
+ Returns:
+ CUCTCDecoder: decoder
+
+ Example
+ >>> decoder = cuda_ctc_decoder(
+ >>> tokens="tokens.txt",
+ >>> blank_skip_threshold=0.95,
+ >>> )
+ >>> results = decoder(log_probs, encoder_out_lens) # List of shape (B, nbest) of Hypotheses
+ """
+ if type(tokens) == str:
+ tokens = _get_vocab_list(tokens)
+
+ return CUCTCDecoder(vocab_list=tokens, beam_size=beam_size, nbest=nbest, blank_skip_threshold=blank_skip_threshold)
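+
+
+# Usage sketch (illustrative): the decoder consumes CUDA log-probabilities and int32 lengths.
+#
+#   decoder = cuda_ctc_decoder("tokens.txt", nbest=1, beam_size=10)
+#   log_probs = torch.nn.functional.log_softmax(acoustic_out, dim=-1)   # CUDA float32, (B, T, num_tokens)
+#   lengths = torch.full((log_probs.size(0),), log_probs.size(1), dtype=torch.int32, device=log_probs.device)
+#   hypos = decoder(log_probs, lengths)
+#   tokens = hypos[0][0].words   # predicted token strings for the first utterance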
diff --git a/MLPY/Lib/site-packages/torchaudio/models/deepspeech.py b/MLPY/Lib/site-packages/torchaudio/models/deepspeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a6a0faa006a3fa6868ccb7e39e68118d8dbe277
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/deepspeech.py
@@ -0,0 +1,84 @@
+import torch
+
+__all__ = ["DeepSpeech"]
+
+
+class FullyConnected(torch.nn.Module):
+ """
+ Args:
+ n_feature: Number of input features
+ n_hidden: Internal hidden unit size.
+ """
+
+ def __init__(self, n_feature: int, n_hidden: int, dropout: float, relu_max_clip: int = 20) -> None:
+ super(FullyConnected, self).__init__()
+ self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True)
+ self.relu_max_clip = relu_max_clip
+ self.dropout = dropout
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.fc(x)
+ x = torch.nn.functional.relu(x)
+ x = torch.nn.functional.hardtanh(x, 0, self.relu_max_clip)
+ if self.dropout:
+ x = torch.nn.functional.dropout(x, self.dropout, self.training)
+ return x
+
+
+class DeepSpeech(torch.nn.Module):
+ """DeepSpeech architecture introduced in
+ *Deep Speech: Scaling up end-to-end speech recognition* :cite:`hannun2014deep`.
+
+ Args:
+ n_feature: Number of input features
+ n_hidden: Internal hidden unit size.
+ n_class: Number of output classes
+ """
+
+ def __init__(
+ self,
+ n_feature: int,
+ n_hidden: int = 2048,
+ n_class: int = 40,
+ dropout: float = 0.0,
+ ) -> None:
+ super(DeepSpeech, self).__init__()
+ self.n_hidden = n_hidden
+ self.fc1 = FullyConnected(n_feature, n_hidden, dropout)
+ self.fc2 = FullyConnected(n_hidden, n_hidden, dropout)
+ self.fc3 = FullyConnected(n_hidden, n_hidden, dropout)
+ self.bi_rnn = torch.nn.RNN(n_hidden, n_hidden, num_layers=1, nonlinearity="relu", bidirectional=True)
+ self.fc4 = FullyConnected(n_hidden, n_hidden, dropout)
+ self.out = torch.nn.Linear(n_hidden, n_class)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ x (torch.Tensor): Tensor of dimension (batch, channel, time, feature).
+ Returns:
+ Tensor: Predictor tensor of dimension (batch, time, class).
+ """
+ # N x C x T x F
+ x = self.fc1(x)
+ # N x C x T x H
+ x = self.fc2(x)
+ # N x C x T x H
+ x = self.fc3(x)
+ # N x C x T x H
+ x = x.squeeze(1)
+ # N x T x H
+ x = x.transpose(0, 1)
+ # T x N x H
+ x, _ = self.bi_rnn(x)
+ # The fifth (non-recurrent) layer takes both the forward and backward units as inputs
+ x = x[:, :, : self.n_hidden] + x[:, :, self.n_hidden :]
+ # T x N x H
+ x = self.fc4(x)
+ # T x N x H
+ x = self.out(x)
+ # T x N x n_class
+ x = x.permute(1, 0, 2)
+ # N x T x n_class
+ x = torch.nn.functional.log_softmax(x, dim=2)
+ # N x T x n_class
+ return x
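+
+
+# Shape sketch (illustrative): the model expects feature frames with an explicit
+# channel dimension of 1.
+#
+#   model = DeepSpeech(n_feature=80, n_class=29)
+#   feats = torch.rand(8, 1, 100, 80)   # (batch, channel, time, feature)
+#   log_probs = model(feats)            # -> (8, 100, 29) log-probabilities over classes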
diff --git a/MLPY/Lib/site-packages/torchaudio/models/emformer.py b/MLPY/Lib/site-packages/torchaudio/models/emformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa678869c07126a9a5556d35f40ca3324b3fe6b4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/emformer.py
@@ -0,0 +1,884 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+
+
+__all__ = ["Emformer"]
+
+
+def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
+ batch_size = lengths.shape[0]
+ max_length = int(torch.max(lengths).item())
+ padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
+ batch_size, max_length
+ ) >= lengths.unsqueeze(1)
+ return padding_mask
+
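+# Worked example (illustrative): lengths = [2, 4, 3] gives max_length 4 and
+#   padding_mask = [[False, False, True,  True ],
+#                   [False, False, False, False],
+#                   [False, False, False, True ]]
+# where True marks padded (invalid) positions.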
+
+def _gen_padding_mask(
+ utterance: torch.Tensor,
+ right_context: torch.Tensor,
+ summary: torch.Tensor,
+ lengths: torch.Tensor,
+ mems: torch.Tensor,
+ left_context_key: Optional[torch.Tensor] = None,
+) -> Optional[torch.Tensor]:
+ T = right_context.size(0) + utterance.size(0) + summary.size(0)
+ B = right_context.size(1)
+ if B == 1:
+ padding_mask = None
+ else:
+ right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0)
+ left_context_blocks_length = left_context_key.size(0) if left_context_key is not None else 0
+ klengths = lengths + mems.size(0) + right_context_blocks_length + left_context_blocks_length
+ padding_mask = _lengths_to_padding_mask(lengths=klengths)
+ return padding_mask
+
+
+def _get_activation_module(activation: str) -> torch.nn.Module:
+ if activation == "relu":
+ return torch.nn.ReLU()
+ elif activation == "gelu":
+ return torch.nn.GELU()
+ elif activation == "silu":
+ return torch.nn.SiLU()
+ else:
+ raise ValueError(f"Unsupported activation {activation}")
+
+
+def _get_weight_init_gains(weight_init_scale_strategy: Optional[str], num_layers: int) -> List[Optional[float]]:
+ if weight_init_scale_strategy is None:
+ return [None for _ in range(num_layers)]
+ elif weight_init_scale_strategy == "depthwise":
+ return [1.0 / math.sqrt(layer_idx + 1) for layer_idx in range(num_layers)]
+ elif weight_init_scale_strategy == "constant":
+ return [1.0 / math.sqrt(2) for layer_idx in range(num_layers)]
+ else:
+ raise ValueError(f"Unsupported weight_init_scale_strategy value {weight_init_scale_strategy}")
+
+
+def _gen_attention_mask_block(
+ col_widths: List[int], col_mask: List[bool], num_rows: int, device: torch.device
+) -> torch.Tensor:
+ if len(col_widths) != len(col_mask):
+ raise ValueError("Length of col_widths must match that of col_mask")
+
+ mask_block = [
+ torch.ones(num_rows, col_width, device=device)
+ if is_ones_col
+ else torch.zeros(num_rows, col_width, device=device)
+ for col_width, is_ones_col in zip(col_widths, col_mask)
+ ]
+ return torch.cat(mask_block, dim=1)
+
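+# Worked example (illustrative): col_widths=[2, 3, 1], col_mask=[True, False, True],
+# num_rows=2 produces the (2, 6) block
+#   [[1., 1., 0., 0., 0., 1.],
+#    [1., 1., 0., 0., 0., 1.]]
+# which serves as one building block of the Emformer attention mask.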
+
+class _EmformerAttention(torch.nn.Module):
+ r"""Emformer layer attention module.
+
+ Args:
+ input_dim (int): input dimension.
+ num_heads (int): number of attention heads in each Emformer layer.
+ dropout (float, optional): dropout probability. (Default: 0.0)
+ weight_init_gain (float or None, optional): scale factor to apply when initializing
+ attention module parameters. (Default: ``None``)
+ tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
+ negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ weight_init_gain: Optional[float] = None,
+ tanh_on_mem: bool = False,
+ negative_inf: float = -1e8,
+ ):
+ super().__init__()
+
+ if input_dim % num_heads != 0:
+ raise ValueError(f"input_dim ({input_dim}) is not a multiple of num_heads ({num_heads}).")
+
+ self.input_dim = input_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.tanh_on_mem = tanh_on_mem
+ self.negative_inf = negative_inf
+
+ self.scaling = (self.input_dim // self.num_heads) ** -0.5
+
+ self.emb_to_key_value = torch.nn.Linear(input_dim, 2 * input_dim, bias=True)
+ self.emb_to_query = torch.nn.Linear(input_dim, input_dim, bias=True)
+ self.out_proj = torch.nn.Linear(input_dim, input_dim, bias=True)
+
+ if weight_init_gain:
+ torch.nn.init.xavier_uniform_(self.emb_to_key_value.weight, gain=weight_init_gain)
+ torch.nn.init.xavier_uniform_(self.emb_to_query.weight, gain=weight_init_gain)
+
+ def _gen_key_value(self, input: torch.Tensor, mems: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ T, _, _ = input.shape
+ summary_length = mems.size(0) + 1
+ right_ctx_utterance_block = input[: T - summary_length]
+ mems_right_ctx_utterance_block = torch.cat([mems, right_ctx_utterance_block])
+ key, value = self.emb_to_key_value(mems_right_ctx_utterance_block).chunk(chunks=2, dim=2)
+ return key, value
+
+ def _gen_attention_probs(
+ self,
+ attention_weights: torch.Tensor,
+ attention_mask: torch.Tensor,
+ padding_mask: Optional[torch.Tensor],
+ ) -> torch.Tensor:
+ attention_weights_float = attention_weights.float()
+ attention_weights_float = attention_weights_float.masked_fill(attention_mask.unsqueeze(0), self.negative_inf)
+ T = attention_weights.size(1)
+ B = attention_weights.size(0) // self.num_heads
+ if padding_mask is not None:
+ attention_weights_float = attention_weights_float.view(B, self.num_heads, T, -1)
+ attention_weights_float = attention_weights_float.masked_fill(
+ padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), self.negative_inf
+ )
+ attention_weights_float = attention_weights_float.view(B * self.num_heads, T, -1)
+ attention_probs = torch.nn.functional.softmax(attention_weights_float, dim=-1).type_as(attention_weights)
+ return torch.nn.functional.dropout(attention_probs, p=float(self.dropout), training=self.training)
+
+ def _forward_impl(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ summary: torch.Tensor,
+ mems: torch.Tensor,
+ attention_mask: torch.Tensor,
+ left_context_key: Optional[torch.Tensor] = None,
+ left_context_val: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ B = utterance.size(1)
+ T = right_context.size(0) + utterance.size(0) + summary.size(0)
+
+ # Compute query with [right context, utterance, summary].
+ query = self.emb_to_query(torch.cat([right_context, utterance, summary]))
+
+ # Compute key and value with [mems, right context, utterance].
+ key, value = self.emb_to_key_value(torch.cat([mems, right_context, utterance])).chunk(chunks=2, dim=2)
+
+ if left_context_key is not None and left_context_val is not None:
+ right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0)
+ key = torch.cat(
+ [
+ key[: mems.size(0) + right_context_blocks_length],
+ left_context_key,
+ key[mems.size(0) + right_context_blocks_length :],
+ ],
+ )
+ value = torch.cat(
+ [
+ value[: mems.size(0) + right_context_blocks_length],
+ left_context_val,
+ value[mems.size(0) + right_context_blocks_length :],
+ ],
+ )
+
+ # Compute attention weights from query, key, and value.
+ reshaped_query, reshaped_key, reshaped_value = [
+ tensor.contiguous().view(-1, B * self.num_heads, self.input_dim // self.num_heads).transpose(0, 1)
+ for tensor in [query, key, value]
+ ]
+ attention_weights = torch.bmm(reshaped_query * self.scaling, reshaped_key.transpose(1, 2))
+
+ # Compute padding mask.
+ padding_mask = _gen_padding_mask(utterance, right_context, summary, lengths, mems, left_context_key)
+
+ # Compute attention probabilities.
+ attention_probs = self._gen_attention_probs(attention_weights, attention_mask, padding_mask)
+
+ # Compute attention.
+ attention = torch.bmm(attention_probs, reshaped_value)
+ if attention.shape != (
+ B * self.num_heads,
+ T,
+ self.input_dim // self.num_heads,
+ ):
+ raise AssertionError("Computed attention has incorrect dimensions")
+ attention = attention.transpose(0, 1).contiguous().view(T, B, self.input_dim)
+
+ # Apply output projection.
+ output_right_context_mems = self.out_proj(attention)
+
+ summary_length = summary.size(0)
+ output_right_context = output_right_context_mems[: T - summary_length]
+ output_mems = output_right_context_mems[T - summary_length :]
+ if self.tanh_on_mem:
+ output_mems = torch.tanh(output_mems)
+ else:
+ output_mems = torch.clamp(output_mems, min=-10, max=10)
+
+ return output_right_context, output_mems, key, value
+
+ def forward(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ summary: torch.Tensor,
+ mems: torch.Tensor,
+ attention_mask: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ D: feature dimension of each frame;
+ T: number of utterance frames;
+ R: number of right context frames;
+ S: number of summary elements;
+ M: number of memory elements.
+
+ Args:
+ utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``utterance``.
+ right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
+ summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
+ mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
+ attention_mask (torch.Tensor): attention mask for underlying attention module.
+
+ Returns:
+ (Tensor, Tensor):
+ Tensor
+ output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
+ Tensor
+ updated memory elements, with shape `(M, B, D)`.
+ """
+ output, output_mems, _, _ = self._forward_impl(utterance, lengths, right_context, summary, mems, attention_mask)
+ return output, output_mems[:-1]
+
+ @torch.jit.export
+ def infer(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ summary: torch.Tensor,
+ mems: torch.Tensor,
+ left_context_key: torch.Tensor,
+ left_context_val: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Forward pass for inference.
+
+ B: batch size;
+ D: feature dimension of each frame;
+ T: number of utterance frames;
+ R: number of right context frames;
+ S: number of summary elements;
+ M: number of memory elements.
+
+ Args:
+ utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``utterance``.
+ right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
+ summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
+ mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
+ left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
+ left_context_val (torch.Tensor): left context attention value computed from preceding invocation.
+
+ Returns:
+ (Tensor, Tensor, Tensor, and Tensor):
+ Tensor
+ output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
+ Tensor
+ updated memory elements, with shape `(M, B, D)`.
+ Tensor
+ attention key computed for left context and utterance.
+ Tensor
+ attention value computed for left context and utterance.
+ """
+ query_dim = right_context.size(0) + utterance.size(0) + summary.size(0)
+ key_dim = right_context.size(0) + utterance.size(0) + mems.size(0) + left_context_key.size(0)
+ attention_mask = torch.zeros(query_dim, key_dim).to(dtype=torch.bool, device=utterance.device)
+ attention_mask[-1, : mems.size(0)] = True
+ output, output_mems, key, value = self._forward_impl(
+ utterance,
+ lengths,
+ right_context,
+ summary,
+ mems,
+ attention_mask,
+ left_context_key=left_context_key,
+ left_context_val=left_context_val,
+ )
+ return (
+ output,
+ output_mems,
+ key[mems.size(0) + right_context.size(0) :],
+ value[mems.size(0) + right_context.size(0) :],
+ )
+
+
+class _EmformerLayer(torch.nn.Module):
+ r"""Emformer layer that constitutes Emformer.
+
+ Args:
+ input_dim (int): input dimension.
+ num_heads (int): number of attention heads.
+ ffn_dim (int): hidden layer dimension of feedforward network.
+ segment_length (int): length of each input segment.
+ dropout (float, optional): dropout probability. (Default: 0.0)
+ activation (str, optional): activation function to use in feedforward network.
+ Must be one of ("relu", "gelu", "silu"). (Default: "relu")
+ left_context_length (int, optional): length of left context. (Default: 0)
+ max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
+ weight_init_gain (float or None, optional): scale factor to apply when initializing
+ attention module parameters. (Default: ``None``)
+ tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
+ negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ num_heads: int,
+ ffn_dim: int,
+ segment_length: int,
+ dropout: float = 0.0,
+ activation: str = "relu",
+ left_context_length: int = 0,
+ max_memory_size: int = 0,
+ weight_init_gain: Optional[float] = None,
+ tanh_on_mem: bool = False,
+ negative_inf: float = -1e8,
+ ):
+ super().__init__()
+
+ self.attention = _EmformerAttention(
+ input_dim=input_dim,
+ num_heads=num_heads,
+ dropout=dropout,
+ weight_init_gain=weight_init_gain,
+ tanh_on_mem=tanh_on_mem,
+ negative_inf=negative_inf,
+ )
+ self.dropout = torch.nn.Dropout(dropout)
+ self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True)
+
+ activation_module = _get_activation_module(activation)
+ self.pos_ff = torch.nn.Sequential(
+ torch.nn.LayerNorm(input_dim),
+ torch.nn.Linear(input_dim, ffn_dim),
+ activation_module,
+ torch.nn.Dropout(dropout),
+ torch.nn.Linear(ffn_dim, input_dim),
+ torch.nn.Dropout(dropout),
+ )
+ self.layer_norm_input = torch.nn.LayerNorm(input_dim)
+ self.layer_norm_output = torch.nn.LayerNorm(input_dim)
+
+ self.left_context_length = left_context_length
+ self.segment_length = segment_length
+ self.max_memory_size = max_memory_size
+ self.input_dim = input_dim
+
+ self.use_mem = max_memory_size > 0
+
+ def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]:
+ empty_memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device)
+ left_context_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
+ left_context_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
+ past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device)
+ return [empty_memory, left_context_key, left_context_val, past_length]
+
+ def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ past_length = state[3][0][0].item()
+ past_left_context_length = min(self.left_context_length, past_length)
+ past_mem_length = min(self.max_memory_size, math.ceil(past_length / self.segment_length))
+ pre_mems = state[0][self.max_memory_size - past_mem_length :]
+ lc_key = state[1][self.left_context_length - past_left_context_length :]
+ lc_val = state[2][self.left_context_length - past_left_context_length :]
+ return pre_mems, lc_key, lc_val
+
+ def _pack_state(
+ self,
+ next_k: torch.Tensor,
+ next_v: torch.Tensor,
+ update_length: int,
+ mems: torch.Tensor,
+ state: List[torch.Tensor],
+ ) -> List[torch.Tensor]:
+ new_k = torch.cat([state[1], next_k])
+ new_v = torch.cat([state[2], next_v])
+ state[0] = torch.cat([state[0], mems])[-self.max_memory_size :]
+ state[1] = new_k[new_k.shape[0] - self.left_context_length :]
+ state[2] = new_v[new_v.shape[0] - self.left_context_length :]
+ state[3] = state[3] + update_length
+ return state
+
+ def _process_attention_output(
+ self,
+ rc_output: torch.Tensor,
+ utterance: torch.Tensor,
+ right_context: torch.Tensor,
+ ) -> torch.Tensor:
+ result = self.dropout(rc_output) + torch.cat([right_context, utterance])
+ result = self.pos_ff(result) + result
+ result = self.layer_norm_output(result)
+ return result
+
+ def _apply_pre_attention_layer_norm(
+ self, utterance: torch.Tensor, right_context: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ layer_norm_input = self.layer_norm_input(torch.cat([right_context, utterance]))
+ return (
+ layer_norm_input[right_context.size(0) :],
+ layer_norm_input[: right_context.size(0)],
+ )
+
+ def _apply_post_attention_ffn(
+ self, rc_output: torch.Tensor, utterance: torch.Tensor, right_context: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ rc_output = self._process_attention_output(rc_output, utterance, right_context)
+ return rc_output[right_context.size(0) :], rc_output[: right_context.size(0)]
+
+ def _apply_attention_forward(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ mems: torch.Tensor,
+ attention_mask: Optional[torch.Tensor],
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ if attention_mask is None:
+ raise ValueError("attention_mask must be not None when for_inference is False")
+
+ if self.use_mem:
+ summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
+ else:
+ summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
+ rc_output, next_m = self.attention(
+ utterance=utterance,
+ lengths=lengths,
+ right_context=right_context,
+ summary=summary,
+ mems=mems,
+ attention_mask=attention_mask,
+ )
+ return rc_output, next_m
+
+ def _apply_attention_infer(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ mems: torch.Tensor,
+ state: Optional[List[torch.Tensor]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
+ if state is None:
+ state = self._init_state(utterance.size(1), device=utterance.device)
+ pre_mems, lc_key, lc_val = self._unpack_state(state)
+ if self.use_mem:
+ summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
+ summary = summary[:1]
+ else:
+ summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
+ rc_output, next_m, next_k, next_v = self.attention.infer(
+ utterance=utterance,
+ lengths=lengths,
+ right_context=right_context,
+ summary=summary,
+ mems=pre_mems,
+ left_context_key=lc_key,
+ left_context_val=lc_val,
+ )
+ state = self._pack_state(next_k, next_v, utterance.size(0), mems, state)
+ return rc_output, next_m, state
+
+ def forward(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ mems: torch.Tensor,
+ attention_mask: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ D: feature dimension of each frame;
+ T: number of utterance frames;
+ R: number of right context frames;
+ M: number of memory elements.
+
+ Args:
+ utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``utterance``.
+ right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
+ mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
+ attention_mask (torch.Tensor): attention mask for underlying attention module.
+
+ Returns:
+ (Tensor, Tensor, Tensor):
+ Tensor
+ encoded utterance frames, with shape `(T, B, D)`.
+ Tensor
+ updated right context frames, with shape `(R, B, D)`.
+ Tensor
+ updated memory elements, with shape `(M, B, D)`.
+ """
+ (
+ layer_norm_utterance,
+ layer_norm_right_context,
+ ) = self._apply_pre_attention_layer_norm(utterance, right_context)
+ rc_output, output_mems = self._apply_attention_forward(
+ layer_norm_utterance,
+ lengths,
+ layer_norm_right_context,
+ mems,
+ attention_mask,
+ )
+ output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
+ return output_utterance, output_right_context, output_mems
+
+ @torch.jit.export
+ def infer(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ state: Optional[List[torch.Tensor]],
+ mems: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
+ r"""Forward pass for inference.
+
+ B: batch size;
+ D: feature dimension of each frame;
+ T: number of utterance frames;
+ R: number of right context frames;
+ M: number of memory elements.
+
+ Args:
+ utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``utterance``.
+ right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
+ state (List[torch.Tensor] or None): list of tensors representing layer internal state
+ generated in preceding invocation of ``infer``.
+ mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
+
+ Returns:
+ (Tensor, Tensor, List[torch.Tensor], Tensor):
+ Tensor
+ encoded utterance frames, with shape `(T, B, D)`.
+ Tensor
+ updated right context frames, with shape `(R, B, D)`.
+ List[Tensor]
+ list of tensors representing layer internal state
+ generated in current invocation of ``infer``.
+ Tensor
+ updated memory elements, with shape `(M, B, D)`.
+ """
+ (
+ layer_norm_utterance,
+ layer_norm_right_context,
+ ) = self._apply_pre_attention_layer_norm(utterance, right_context)
+ rc_output, output_mems, output_state = self._apply_attention_infer(
+ layer_norm_utterance, lengths, layer_norm_right_context, mems, state
+ )
+ output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
+ return output_utterance, output_right_context, output_state, output_mems
+
+
+class _EmformerImpl(torch.nn.Module):
+ def __init__(
+ self,
+ emformer_layers: torch.nn.ModuleList,
+ segment_length: int,
+ left_context_length: int = 0,
+ right_context_length: int = 0,
+ max_memory_size: int = 0,
+ ):
+ super().__init__()
+
+ self.use_mem = max_memory_size > 0
+ self.memory_op = torch.nn.AvgPool1d(
+ kernel_size=segment_length,
+ stride=segment_length,
+ ceil_mode=True,
+ )
+ self.emformer_layers = emformer_layers
+ self.left_context_length = left_context_length
+ self.right_context_length = right_context_length
+ self.segment_length = segment_length
+ self.max_memory_size = max_memory_size
+
+ def _gen_right_context(self, input: torch.Tensor) -> torch.Tensor:
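+        # Collect the right-context block of every segment into one tensor (the Emformer
+        # "hard copy" of future frames). Illustrative numbers only: with segment_length=4,
+        # right_context_length=1 and T=13 (12 utterance frames + 1 trailing right-context frame),
+        # num_segs is 3 and the result stacks input[4:5], input[8:9] and input[12:13].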
+ T = input.shape[0]
+ num_segs = math.ceil((T - self.right_context_length) / self.segment_length)
+ right_context_blocks = []
+ for seg_idx in range(num_segs - 1):
+ start = (seg_idx + 1) * self.segment_length
+ end = start + self.right_context_length
+ right_context_blocks.append(input[start:end])
+ right_context_blocks.append(input[T - self.right_context_length :])
+ return torch.cat(right_context_blocks)
+
+ def _gen_attention_mask_col_widths(self, seg_idx: int, utterance_length: int) -> List[int]:
+ num_segs = math.ceil(utterance_length / self.segment_length)
+ rc = self.right_context_length
+ lc = self.left_context_length
+ rc_start = seg_idx * rc
+ rc_end = rc_start + rc
+ seg_start = max(seg_idx * self.segment_length - lc, 0)
+ seg_end = min((seg_idx + 1) * self.segment_length, utterance_length)
+ rc_length = self.right_context_length * num_segs
+
+ if self.use_mem:
+ m_start = max(seg_idx - self.max_memory_size, 0)
+ mem_length = num_segs - 1
+ col_widths = [
+ m_start, # before memory
+ seg_idx - m_start, # memory
+ mem_length - seg_idx, # after memory
+ rc_start, # before right context
+ rc, # right context
+ rc_length - rc_end, # after right context
+ seg_start, # before query segment
+ seg_end - seg_start, # query segment
+ utterance_length - seg_end, # after query segment
+ ]
+ else:
+ col_widths = [
+ rc_start, # before right context
+ rc, # right context
+ rc_length - rc_end, # after right context
+ seg_start, # before query segment
+ seg_end - seg_start, # query segment
+ utterance_length - seg_end, # after query segment
+ ]
+
+ return col_widths
+
+ def _gen_attention_mask(self, input: torch.Tensor) -> torch.Tensor:
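+        # Assemble the attention mask row-by-row for the hard-copied right-context frames, the
+        # utterance (query) frames and, when memory is enabled, the per-segment summary vectors.
+        # The block helper marks attended column spans with ones, so the trailing ``1 - ...``
+        # yields a boolean mask that is True at positions attention must not reach.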
+ utterance_length = input.size(0)
+ num_segs = math.ceil(utterance_length / self.segment_length)
+
+ rc_mask = []
+ query_mask = []
+ summary_mask = []
+
+ if self.use_mem:
+ num_cols = 9
+ # memory, right context, query segment
+ rc_q_cols_mask = [idx in [1, 4, 7] for idx in range(num_cols)]
+ # right context, query segment
+ s_cols_mask = [idx in [4, 7] for idx in range(num_cols)]
+ masks_to_concat = [rc_mask, query_mask, summary_mask]
+ else:
+ num_cols = 6
+ # right context, query segment
+ rc_q_cols_mask = [idx in [1, 4] for idx in range(num_cols)]
+ s_cols_mask = None
+ masks_to_concat = [rc_mask, query_mask]
+
+ for seg_idx in range(num_segs):
+ col_widths = self._gen_attention_mask_col_widths(seg_idx, utterance_length)
+
+ rc_mask_block = _gen_attention_mask_block(
+ col_widths, rc_q_cols_mask, self.right_context_length, input.device
+ )
+ rc_mask.append(rc_mask_block)
+
+ query_mask_block = _gen_attention_mask_block(
+ col_widths,
+ rc_q_cols_mask,
+ min(
+ self.segment_length,
+ utterance_length - seg_idx * self.segment_length,
+ ),
+ input.device,
+ )
+ query_mask.append(query_mask_block)
+
+ if s_cols_mask is not None:
+ summary_mask_block = _gen_attention_mask_block(col_widths, s_cols_mask, 1, input.device)
+ summary_mask.append(summary_mask_block)
+
+ attention_mask = (1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])).to(torch.bool)
+ return attention_mask
+
+ def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training and non-streaming inference.
+
+ B: batch size;
+ T: max number of input frames in batch;
+ D: feature dimension of each frame.
+
+ Args:
+ input (torch.Tensor): utterance frames right-padded with right context frames, with
+ shape `(B, T + right_context_length, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid utterance frames for i-th batch element in ``input``.
+
+ Returns:
+ (Tensor, Tensor):
+ Tensor
+ output frames, with shape `(B, T, D)`.
+ Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in output frames.
+ """
+ input = input.permute(1, 0, 2)
+ right_context = self._gen_right_context(input)
+ utterance = input[: input.size(0) - self.right_context_length]
+ attention_mask = self._gen_attention_mask(utterance)
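+        # When memory is enabled, the memory bank holds one average-pooled summary per segment;
+        # the final segment's summary is dropped ([:-1]) because memory entries only ever
+        # describe segments preceding the one currently being attended from.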
+ mems = (
+ self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
+ if self.use_mem
+ else torch.empty(0).to(dtype=input.dtype, device=input.device)
+ )
+ output = utterance
+ for layer in self.emformer_layers:
+ output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
+ return output.permute(1, 0, 2), lengths
+
+ @torch.jit.export
+ def infer(
+ self,
+ input: torch.Tensor,
+ lengths: torch.Tensor,
+ states: Optional[List[List[torch.Tensor]]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ r"""Forward pass for streaming inference.
+
+ B: batch size;
+ D: feature dimension of each frame.
+
+ Args:
+ input (torch.Tensor): utterance frames right-padded with right context frames, with
+ shape `(B, segment_length + right_context_length, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``input``.
+ states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
+ representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)
+
+ Returns:
+ (Tensor, Tensor, List[List[Tensor]]):
+ Tensor
+ output frames, with shape `(B, segment_length, D)`.
+ Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in output frames.
+ List[List[Tensor]]
+ output states; list of lists of tensors representing internal state
+ generated in current invocation of ``infer``.
+ """
+ if input.size(1) != self.segment_length + self.right_context_length:
+ raise ValueError(
+ "Per configured segment_length and right_context_length"
+ f", expected size of {self.segment_length + self.right_context_length} for dimension 1 of input"
+ f", but got {input.size(1)}."
+ )
+ input = input.permute(1, 0, 2)
+ right_context_start_idx = input.size(0) - self.right_context_length
+ right_context = input[right_context_start_idx:]
+ utterance = input[:right_context_start_idx]
+ output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
+ mems = (
+ self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
+ if self.use_mem
+ else torch.empty(0).to(dtype=input.dtype, device=input.device)
+ )
+ output = utterance
+ output_states: List[List[torch.Tensor]] = []
+ for layer_idx, layer in enumerate(self.emformer_layers):
+ output, right_context, output_state, mems = layer.infer(
+ output,
+ output_lengths,
+ right_context,
+ None if states is None else states[layer_idx],
+ mems,
+ )
+ output_states.append(output_state)
+
+ return output.permute(1, 0, 2), output_lengths, output_states
+
+
+class Emformer(_EmformerImpl):
+ r"""Emformer architecture introduced in
+ *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
+ :cite:`shi2021emformer`.
+
+ See Also:
+ * :func:`~torchaudio.models.emformer_rnnt_model`,
+ :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
+ * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.
+
+ Args:
+ input_dim (int): input dimension.
+ num_heads (int): number of attention heads in each Emformer layer.
+ ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
+ num_layers (int): number of Emformer layers to instantiate.
+ segment_length (int): length of each input segment.
+ dropout (float, optional): dropout probability. (Default: 0.0)
+ activation (str, optional): activation function to use in each Emformer layer's
+ feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
+ left_context_length (int, optional): length of left context. (Default: 0)
+ right_context_length (int, optional): length of right context. (Default: 0)
+ max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
+ weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
+ strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
+ tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
+ negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
+
+ Examples:
+ >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
+ >>> input = torch.rand(128, 400, 512) # batch, num_frames, feature_dim
+ >>> lengths = torch.randint(1, 200, (128,)) # batch
+ >>> output, lengths = emformer(input, lengths)
+ >>> input = torch.rand(128, 5, 512)
+ >>> lengths = torch.ones(128) * 5
+ >>> output, lengths, states = emformer.infer(input, lengths, None)
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ num_heads: int,
+ ffn_dim: int,
+ num_layers: int,
+ segment_length: int,
+ dropout: float = 0.0,
+ activation: str = "relu",
+ left_context_length: int = 0,
+ right_context_length: int = 0,
+ max_memory_size: int = 0,
+ weight_init_scale_strategy: Optional[str] = "depthwise",
+ tanh_on_mem: bool = False,
+ negative_inf: float = -1e8,
+ ):
+ weight_init_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers)
+ emformer_layers = torch.nn.ModuleList(
+ [
+ _EmformerLayer(
+ input_dim,
+ num_heads,
+ ffn_dim,
+ segment_length,
+ dropout=dropout,
+ activation=activation,
+ left_context_length=left_context_length,
+ max_memory_size=max_memory_size,
+ weight_init_gain=weight_init_gains[layer_idx],
+ tanh_on_mem=tanh_on_mem,
+ negative_inf=negative_inf,
+ )
+ for layer_idx in range(num_layers)
+ ]
+ )
+ super().__init__(
+ emformer_layers,
+ segment_length,
+ left_context_length=left_context_length,
+ right_context_length=right_context_length,
+ max_memory_size=max_memory_size,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/models/rnnt.py b/MLPY/Lib/site-packages/torchaudio/models/rnnt.py
new file mode 100644
index 0000000000000000000000000000000000000000..659c7b93442095ad3d7c86e38e328094ce552d0c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/rnnt.py
@@ -0,0 +1,816 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple
+
+import torch
+from torchaudio.models import Emformer
+
+
+__all__ = ["RNNT", "emformer_rnnt_base", "emformer_rnnt_model"]
+
+
+class _TimeReduction(torch.nn.Module):
+    r"""Coalesces frames along the time dimension into fewer frames
+    with higher feature dimensionality.
+
+ Args:
+ stride (int): number of frames to merge for each output frame.
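+
+    Example (illustrative sketch; shapes are arbitrary):
+        >>> time_reduction = _TimeReduction(stride=4)
+        >>> input = torch.rand(3, 10, 80)  # (B, T, D); the trailing T % stride frames are dropped
+        >>> lengths = torch.tensor([10, 8, 6])
+        >>> output, output_lengths = time_reduction(input, lengths)
+        >>> output.shape
+        torch.Size([3, 2, 320])
+        >>> output_lengths
+        tensor([2, 2, 1])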
+ """
+
+ def __init__(self, stride: int) -> None:
+ super().__init__()
+ self.stride = stride
+
+ def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ r"""Forward pass.
+
+ B: batch size;
+ T: maximum input sequence length in batch;
+ D: feature dimension of each input sequence frame.
+
+ Args:
+ input (torch.Tensor): input sequences, with shape `(B, T, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``input``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor):
+ torch.Tensor
+ output sequences, with shape
+ `(B, T // stride, D * stride)`
+ torch.Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in output sequences.
+ """
+ B, T, D = input.shape
+ num_frames = T - (T % self.stride)
+ input = input[:, :num_frames, :]
+ lengths = lengths.div(self.stride, rounding_mode="trunc")
+ T_max = num_frames // self.stride
+
+ output = input.reshape(B, T_max, D * self.stride)
+ output = output.contiguous()
+ return output, lengths
+
+
+class _CustomLSTM(torch.nn.Module):
+    r"""Custom long short-term memory (LSTM) block that applies layer normalization
+ to internal nodes.
+
+ Args:
+ input_dim (int): input dimension.
+ hidden_dim (int): hidden dimension.
+ layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``)
+ layer_norm_epsilon (float, optional): value of epsilon to use in
+ layer normalization layers (Default: 1e-5)
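+
+    Example (illustrative sketch; sizes are arbitrary):
+        >>> lstm = _CustomLSTM(input_dim=16, hidden_dim=32, layer_norm=True)
+        >>> input = torch.rand(5, 2, 16)  # (T, B, D)
+        >>> output, state = lstm(input, None)
+        >>> output.shape
+        torch.Size([5, 2, 32])
+        >>> len(state)  # [h, c]
+        2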
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ hidden_dim: int,
+ layer_norm: bool = False,
+ layer_norm_epsilon: float = 1e-5,
+ ) -> None:
+ super().__init__()
+ self.x2g = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=(not layer_norm))
+ self.p2g = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=False)
+ if layer_norm:
+ self.c_norm = torch.nn.LayerNorm(hidden_dim, eps=layer_norm_epsilon)
+ self.g_norm = torch.nn.LayerNorm(4 * hidden_dim, eps=layer_norm_epsilon)
+ else:
+ self.c_norm = torch.nn.Identity()
+ self.g_norm = torch.nn.Identity()
+
+ self.hidden_dim = hidden_dim
+
+ def forward(
+ self, input: torch.Tensor, state: Optional[List[torch.Tensor]]
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+ r"""Forward pass.
+
+ B: batch size;
+ T: maximum sequence length in batch;
+ D: feature dimension of each input sequence element.
+
+ Args:
+ input (torch.Tensor): with shape `(T, B, D)`.
+ state (List[torch.Tensor] or None): list of tensors
+ representing internal state generated in preceding invocation
+ of ``forward``.
+
+ Returns:
+ (torch.Tensor, List[torch.Tensor]):
+ torch.Tensor
+ output, with shape `(T, B, hidden_dim)`.
+ List[torch.Tensor]
+ list of tensors representing internal state generated
+ in current invocation of ``forward``.
+ """
+ if state is None:
+ B = input.size(1)
+ h = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype)
+ c = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype)
+ else:
+ h, c = state
+
+ gated_input = self.x2g(input)
+ outputs = []
+ for gates in gated_input.unbind(0):
+ gates = gates + self.p2g(h)
+ gates = self.g_norm(gates)
+ input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1)
+ input_gate = input_gate.sigmoid()
+ forget_gate = forget_gate.sigmoid()
+ cell_gate = cell_gate.tanh()
+ output_gate = output_gate.sigmoid()
+ c = forget_gate * c + input_gate * cell_gate
+ c = self.c_norm(c)
+ h = output_gate * c.tanh()
+ outputs.append(h)
+
+ output = torch.stack(outputs, dim=0)
+ state = [h, c]
+
+ return output, state
+
+
+class _Transcriber(ABC):
+ @abstractmethod
+ def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ pass
+
+ @abstractmethod
+ def infer(
+ self,
+ input: torch.Tensor,
+ lengths: torch.Tensor,
+ states: Optional[List[List[torch.Tensor]]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ pass
+
+
+class _EmformerEncoder(torch.nn.Module, _Transcriber):
+ r"""Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network).
+
+ Args:
+ input_dim (int): feature dimension of each input sequence element.
+ output_dim (int): feature dimension of each output sequence element.
+ segment_length (int): length of input segment expressed as number of frames.
+ right_context_length (int): length of right context expressed as number of frames.
+ time_reduction_input_dim (int): dimension to scale each element in input sequences to
+ prior to applying time reduction block.
+ time_reduction_stride (int): factor by which to reduce length of input sequence.
+ transformer_num_heads (int): number of attention heads in each Emformer layer.
+ transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
+ transformer_num_layers (int): number of Emformer layers to instantiate.
+ transformer_left_context_length (int): length of left context.
+ transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0)
+ transformer_activation (str, optional): activation function to use in each Emformer layer's
+ feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
+ transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
+ transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling
+ strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
+ transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
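+
+    Example (illustrative sketch; the hyperparameters below are arbitrary, not a pretrained configuration):
+        >>> encoder = _EmformerEncoder(
+        ...     input_dim=80,
+        ...     output_dim=256,
+        ...     segment_length=16,
+        ...     right_context_length=4,
+        ...     time_reduction_input_dim=128,
+        ...     time_reduction_stride=4,
+        ...     transformer_num_heads=4,
+        ...     transformer_ffn_dim=512,
+        ...     transformer_num_layers=2,
+        ...     transformer_left_context_length=30,
+        ... )
+        >>> input = torch.rand(2, 132, 80)  # 128 utterance frames + 4 right-context frames
+        >>> lengths = torch.tensor([128, 128])  # valid utterance frames, excluding right context
+        >>> output, output_lengths = encoder(input, lengths)
+        >>> output.shape
+        torch.Size([2, 32, 256])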
+ """
+
+ def __init__(
+ self,
+ *,
+ input_dim: int,
+ output_dim: int,
+ segment_length: int,
+ right_context_length: int,
+ time_reduction_input_dim: int,
+ time_reduction_stride: int,
+ transformer_num_heads: int,
+ transformer_ffn_dim: int,
+ transformer_num_layers: int,
+ transformer_left_context_length: int,
+ transformer_dropout: float = 0.0,
+ transformer_activation: str = "relu",
+ transformer_max_memory_size: int = 0,
+ transformer_weight_init_scale_strategy: str = "depthwise",
+ transformer_tanh_on_mem: bool = False,
+ ) -> None:
+ super().__init__()
+ self.input_linear = torch.nn.Linear(
+ input_dim,
+ time_reduction_input_dim,
+ bias=False,
+ )
+ self.time_reduction = _TimeReduction(time_reduction_stride)
+ transformer_input_dim = time_reduction_input_dim * time_reduction_stride
+ self.transformer = Emformer(
+ transformer_input_dim,
+ transformer_num_heads,
+ transformer_ffn_dim,
+ transformer_num_layers,
+ segment_length // time_reduction_stride,
+ dropout=transformer_dropout,
+ activation=transformer_activation,
+ left_context_length=transformer_left_context_length,
+ right_context_length=right_context_length // time_reduction_stride,
+ max_memory_size=transformer_max_memory_size,
+ weight_init_scale_strategy=transformer_weight_init_scale_strategy,
+ tanh_on_mem=transformer_tanh_on_mem,
+ )
+ self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim)
+ self.layer_norm = torch.nn.LayerNorm(output_dim)
+
+ def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ T: maximum input sequence length in batch;
+ D: feature dimension of each input sequence frame (input_dim).
+
+ Args:
+ input (torch.Tensor): input frame sequences right-padded with right context, with
+ shape `(B, T + right context length, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``input``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor):
+ torch.Tensor
+ output frame sequences, with
+ shape `(B, T // time_reduction_stride, output_dim)`.
+ torch.Tensor
+                    output lengths, with shape `(B,)` and i-th element representing
+ number of valid elements for i-th batch element in output frame sequences.
+ """
+ input_linear_out = self.input_linear(input)
+ time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
+ transformer_out, transformer_lengths = self.transformer(time_reduction_out, time_reduction_lengths)
+ output_linear_out = self.output_linear(transformer_out)
+ layer_norm_out = self.layer_norm(output_linear_out)
+ return layer_norm_out, transformer_lengths
+
+ @torch.jit.export
+ def infer(
+ self,
+ input: torch.Tensor,
+ lengths: torch.Tensor,
+ states: Optional[List[List[torch.Tensor]]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ r"""Forward pass for inference.
+
+ B: batch size;
+ T: maximum input sequence segment length in batch;
+ D: feature dimension of each input sequence frame (input_dim).
+
+ Args:
+ input (torch.Tensor): input frame sequence segments right-padded with right context, with
+ shape `(B, T + right context length, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``input``.
+            states (List[List[torch.Tensor]] or None): list of lists of tensors
+ representing internal state generated in preceding invocation
+ of ``infer``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
+ torch.Tensor
+ output frame sequences, with
+ shape `(B, T // time_reduction_stride, output_dim)`.
+ torch.Tensor
+                    output lengths, with shape `(B,)` and i-th element representing
+ number of valid elements for i-th batch element in output.
+ List[List[torch.Tensor]]
+ output states; list of lists of tensors
+ representing internal state generated in current invocation
+ of ``infer``.
+ """
+ input_linear_out = self.input_linear(input)
+ time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
+ (
+ transformer_out,
+ transformer_lengths,
+ transformer_states,
+ ) = self.transformer.infer(time_reduction_out, time_reduction_lengths, states)
+ output_linear_out = self.output_linear(transformer_out)
+ layer_norm_out = self.layer_norm(output_linear_out)
+ return layer_norm_out, transformer_lengths, transformer_states
+
+
+class _Predictor(torch.nn.Module):
+ r"""Recurrent neural network transducer (RNN-T) prediction network.
+
+ Args:
+ num_symbols (int): size of target token lexicon.
+ output_dim (int): feature dimension of each output sequence element.
+ symbol_embedding_dim (int): dimension of each target token embedding.
+ num_lstm_layers (int): number of LSTM layers to instantiate.
+ lstm_hidden_dim (int): output dimension of each LSTM layer.
+ lstm_layer_norm (bool, optional): if ``True``, enables layer normalization
+ for LSTM layers. (Default: ``False``)
+ lstm_layer_norm_epsilon (float, optional): value of epsilon to use in
+ LSTM layer normalization layers. (Default: 1e-5)
+ lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0)
+
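+    Example (illustrative sketch; sizes are arbitrary):
+        >>> predictor = _Predictor(
+        ...     num_symbols=100,
+        ...     output_dim=256,
+        ...     symbol_embedding_dim=64,
+        ...     num_lstm_layers=2,
+        ...     lstm_hidden_dim=64,
+        ... )
+        >>> targets = torch.randint(0, 100, (3, 7))  # (B, U)
+        >>> target_lengths = torch.tensor([7, 5, 4])
+        >>> output, lengths, states = predictor(targets, target_lengths)
+        >>> output.shape
+        torch.Size([3, 7, 256])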
+ """
+
+ def __init__(
+ self,
+ num_symbols: int,
+ output_dim: int,
+ symbol_embedding_dim: int,
+ num_lstm_layers: int,
+ lstm_hidden_dim: int,
+ lstm_layer_norm: bool = False,
+ lstm_layer_norm_epsilon: float = 1e-5,
+ lstm_dropout: float = 0.0,
+ ) -> None:
+ super().__init__()
+ self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim)
+ self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim)
+ self.lstm_layers = torch.nn.ModuleList(
+ [
+ _CustomLSTM(
+ symbol_embedding_dim if idx == 0 else lstm_hidden_dim,
+ lstm_hidden_dim,
+ layer_norm=lstm_layer_norm,
+ layer_norm_epsilon=lstm_layer_norm_epsilon,
+ )
+ for idx in range(num_lstm_layers)
+ ]
+ )
+ self.dropout = torch.nn.Dropout(p=lstm_dropout)
+ self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim)
+ self.output_layer_norm = torch.nn.LayerNorm(output_dim)
+
+ self.lstm_dropout = lstm_dropout
+
+ def forward(
+ self,
+ input: torch.Tensor,
+ lengths: torch.Tensor,
+ state: Optional[List[List[torch.Tensor]]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ r"""Forward pass.
+
+ B: batch size;
+ U: maximum sequence length in batch;
+ D: feature dimension of each input sequence element.
+
+ Args:
+ input (torch.Tensor): target sequences, with shape `(B, U)` and each element
+ mapping to a target symbol, i.e. in range `[0, num_symbols)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``input``.
+ state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
+ representing internal state generated in preceding invocation
+ of ``forward``. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
+ torch.Tensor
+ output encoding sequences, with shape `(B, U, output_dim)`
+ torch.Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid elements for i-th batch element in output encoding sequences.
+ List[List[torch.Tensor]]
+ output states; list of lists of tensors
+ representing internal state generated in current invocation of ``forward``.
+ """
+ input_tb = input.permute(1, 0)
+ embedding_out = self.embedding(input_tb)
+ input_layer_norm_out = self.input_layer_norm(embedding_out)
+
+ lstm_out = input_layer_norm_out
+ state_out: List[List[torch.Tensor]] = []
+ for layer_idx, lstm in enumerate(self.lstm_layers):
+ lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx])
+ lstm_out = self.dropout(lstm_out)
+ state_out.append(lstm_state_out)
+
+ linear_out = self.linear(lstm_out)
+ output_layer_norm_out = self.output_layer_norm(linear_out)
+ return output_layer_norm_out.permute(1, 0, 2), lengths, state_out
+
+
+class _Joiner(torch.nn.Module):
+ r"""Recurrent neural network transducer (RNN-T) joint network.
+
+ Args:
+ input_dim (int): source and target input dimension.
+ output_dim (int): output dimension.
+ activation (str, optional): activation function to use in the joiner.
+ Must be one of ("relu", "tanh"). (Default: "relu")
+
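+    Example (illustrative sketch; sizes are arbitrary):
+        >>> joiner = _Joiner(input_dim=256, output_dim=100)
+        >>> source_encodings = torch.rand(3, 20, 256)  # (B, T, D)
+        >>> target_encodings = torch.rand(3, 7, 256)  # (B, U, D)
+        >>> source_lengths = torch.tensor([20, 18, 15])
+        >>> target_lengths = torch.tensor([7, 6, 4])
+        >>> output, _, _ = joiner(source_encodings, source_lengths, target_encodings, target_lengths)
+        >>> output.shape  # broadcast sum over (T, U) followed by the linear projection
+        torch.Size([3, 20, 7, 100])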
+ """
+
+ def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None:
+ super().__init__()
+ self.linear = torch.nn.Linear(input_dim, output_dim, bias=True)
+ if activation == "relu":
+ self.activation = torch.nn.ReLU()
+ elif activation == "tanh":
+ self.activation = torch.nn.Tanh()
+ else:
+ raise ValueError(f"Unsupported activation {activation}")
+
+ def forward(
+ self,
+ source_encodings: torch.Tensor,
+ source_lengths: torch.Tensor,
+ target_encodings: torch.Tensor,
+ target_lengths: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ U: maximum target sequence length in batch;
+ D: dimension of each source and target sequence encoding.
+
+ Args:
+ source_encodings (torch.Tensor): source encoding sequences, with
+ shape `(B, T, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``source_encodings``.
+ target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``target_encodings``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor, torch.Tensor):
+ torch.Tensor
+ joint network output, with shape `(B, T, U, output_dim)`.
+ torch.Tensor
+ output source lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 1 for i-th batch element in joint network output.
+ torch.Tensor
+ output target lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 2 for i-th batch element in joint network output.
+ """
+ joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous()
+ activation_out = self.activation(joint_encodings)
+ output = self.linear(activation_out)
+ return output, source_lengths, target_lengths
+
+
+class RNNT(torch.nn.Module):
+ r"""torchaudio.models.RNNT()
+
+ Recurrent neural network transducer (RNN-T) model.
+
+ Note:
+ To build the model, please use one of the factory functions.
+
+ See Also:
+ :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pre-trained models.
+
+ Args:
+ transcriber (torch.nn.Module): transcription network.
+ predictor (torch.nn.Module): prediction network.
+ joiner (torch.nn.Module): joint network.
+ """
+
+ def __init__(self, transcriber: _Transcriber, predictor: _Predictor, joiner: _Joiner) -> None:
+ super().__init__()
+ self.transcriber = transcriber
+ self.predictor = predictor
+ self.joiner = joiner
+
+ def forward(
+ self,
+ sources: torch.Tensor,
+ source_lengths: torch.Tensor,
+ targets: torch.Tensor,
+ target_lengths: torch.Tensor,
+ predictor_state: Optional[List[List[torch.Tensor]]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ U: maximum target sequence length in batch;
+ D: feature dimension of each source sequence element.
+
+ Args:
+ sources (torch.Tensor): source frame sequences right-padded with right context, with
+ shape `(B, T, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``sources``.
+ targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
+ mapping to a target symbol.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``targets``.
+ predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
+ representing prediction network internal state generated in preceding invocation
+ of ``forward``. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
+ torch.Tensor
+ joint network output, with shape
+ `(B, max output source length, max output target length, output_dim (number of target symbols))`.
+ torch.Tensor
+ output source lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 1 for i-th batch element in joint network output.
+ torch.Tensor
+ output target lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 2 for i-th batch element in joint network output.
+ List[List[torch.Tensor]]
+ output states; list of lists of tensors
+ representing prediction network internal state generated in current invocation
+ of ``forward``.
+ """
+ source_encodings, source_lengths = self.transcriber(
+ input=sources,
+ lengths=source_lengths,
+ )
+ target_encodings, target_lengths, predictor_state = self.predictor(
+ input=targets,
+ lengths=target_lengths,
+ state=predictor_state,
+ )
+ output, source_lengths, target_lengths = self.joiner(
+ source_encodings=source_encodings,
+ source_lengths=source_lengths,
+ target_encodings=target_encodings,
+ target_lengths=target_lengths,
+ )
+
+ return (
+ output,
+ source_lengths,
+ target_lengths,
+ predictor_state,
+ )
+
+ @torch.jit.export
+ def transcribe_streaming(
+ self,
+ sources: torch.Tensor,
+ source_lengths: torch.Tensor,
+ state: Optional[List[List[torch.Tensor]]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ r"""Applies transcription network to sources in streaming mode.
+
+ B: batch size;
+ T: maximum source sequence segment length in batch;
+ D: feature dimension of each source sequence frame.
+
+ Args:
+ sources (torch.Tensor): source frame sequence segments right-padded with right context, with
+ shape `(B, T + right context length, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``sources``.
+ state (List[List[torch.Tensor]] or None): list of lists of tensors
+ representing transcription network internal state generated in preceding invocation
+ of ``transcribe_streaming``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
+ torch.Tensor
+ output frame sequences, with
+ shape `(B, T // time_reduction_stride, output_dim)`.
+ torch.Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid elements for i-th batch element in output.
+ List[List[torch.Tensor]]
+ output states; list of lists of tensors
+ representing transcription network internal state generated in current invocation
+ of ``transcribe_streaming``.
+ """
+ return self.transcriber.infer(sources, source_lengths, state)
+
+ @torch.jit.export
+ def transcribe(
+ self,
+ sources: torch.Tensor,
+ source_lengths: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ r"""Applies transcription network to sources in non-streaming mode.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ D: feature dimension of each source sequence frame.
+
+ Args:
+ sources (torch.Tensor): source frame sequences right-padded with right context, with
+ shape `(B, T + right context length, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``sources``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor):
+ torch.Tensor
+ output frame sequences, with
+ shape `(B, T // time_reduction_stride, output_dim)`.
+ torch.Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid elements for i-th batch element in output frame sequences.
+ """
+ return self.transcriber(sources, source_lengths)
+
+ @torch.jit.export
+ def predict(
+ self,
+ targets: torch.Tensor,
+ target_lengths: torch.Tensor,
+ state: Optional[List[List[torch.Tensor]]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ r"""Applies prediction network to targets.
+
+ B: batch size;
+ U: maximum target sequence length in batch;
+ D: feature dimension of each target sequence frame.
+
+ Args:
+ targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
+ mapping to a target symbol, i.e. in range `[0, num_symbols)`.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``targets``.
+ state (List[List[torch.Tensor]] or None): list of lists of tensors
+ representing internal state generated in preceding invocation
+ of ``predict``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
+ torch.Tensor
+ output frame sequences, with shape `(B, U, output_dim)`.
+ torch.Tensor
+ output lengths, with shape `(B,)` and i-th element representing
+ number of valid elements for i-th batch element in output.
+ List[List[torch.Tensor]]
+ output states; list of lists of tensors
+ representing internal state generated in current invocation of ``predict``.
+ """
+ return self.predictor(input=targets, lengths=target_lengths, state=state)
+
+ @torch.jit.export
+ def join(
+ self,
+ source_encodings: torch.Tensor,
+ source_lengths: torch.Tensor,
+ target_encodings: torch.Tensor,
+ target_lengths: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Applies joint network to source and target encodings.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ U: maximum target sequence length in batch;
+ D: dimension of each source and target sequence encoding.
+
+ Args:
+ source_encodings (torch.Tensor): source encoding sequences, with
+ shape `(B, T, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``source_encodings``.
+ target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``target_encodings``.
+
+ Returns:
+ (torch.Tensor, torch.Tensor, torch.Tensor):
+ torch.Tensor
+ joint network output, with shape `(B, T, U, output_dim)`.
+ torch.Tensor
+ output source lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 1 for i-th batch element in joint network output.
+ torch.Tensor
+ output target lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 2 for i-th batch element in joint network output.
+ """
+ output, source_lengths, target_lengths = self.joiner(
+ source_encodings=source_encodings,
+ source_lengths=source_lengths,
+ target_encodings=target_encodings,
+ target_lengths=target_lengths,
+ )
+ return output, source_lengths, target_lengths
+
+
+def emformer_rnnt_model(
+ *,
+ input_dim: int,
+ encoding_dim: int,
+ num_symbols: int,
+ segment_length: int,
+ right_context_length: int,
+ time_reduction_input_dim: int,
+ time_reduction_stride: int,
+ transformer_num_heads: int,
+ transformer_ffn_dim: int,
+ transformer_num_layers: int,
+ transformer_dropout: float,
+ transformer_activation: str,
+ transformer_left_context_length: int,
+ transformer_max_memory_size: int,
+ transformer_weight_init_scale_strategy: str,
+ transformer_tanh_on_mem: bool,
+ symbol_embedding_dim: int,
+ num_lstm_layers: int,
+ lstm_layer_norm: bool,
+ lstm_layer_norm_epsilon: float,
+ lstm_dropout: float,
+) -> RNNT:
+ r"""Builds Emformer-based :class:`~torchaudio.models.RNNT`.
+
+ Note:
+ For non-streaming inference, the expectation is for `transcribe` to be called on input
+ sequences right-concatenated with `right_context_length` frames.
+
+ For streaming inference, the expectation is for `transcribe_streaming` to be called
+ on input chunks comprising `segment_length` frames right-concatenated with `right_context_length`
+ frames.
+
+ Args:
+ input_dim (int): dimension of input sequence frames passed to transcription network.
+ encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
+ passed to joint network.
+ num_symbols (int): cardinality of set of target tokens.
+ segment_length (int): length of input segment expressed as number of frames.
+ right_context_length (int): length of right context expressed as number of frames.
+ time_reduction_input_dim (int): dimension to scale each element in input sequences to
+ prior to applying time reduction block.
+ time_reduction_stride (int): factor by which to reduce length of input sequence.
+ transformer_num_heads (int): number of attention heads in each Emformer layer.
+ transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
+ transformer_num_layers (int): number of Emformer layers to instantiate.
+ transformer_left_context_length (int): length of left context considered by Emformer.
+ transformer_dropout (float): Emformer dropout probability.
+ transformer_activation (str): activation function to use in each Emformer layer's
+ feedforward network. Must be one of ("relu", "gelu", "silu").
+ transformer_max_memory_size (int): maximum number of memory elements to use.
+ transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling
+ strategy. Must be one of ("depthwise", "constant", ``None``).
+ transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements.
+ symbol_embedding_dim (int): dimension of each target token embedding.
+ num_lstm_layers (int): number of LSTM layers to instantiate.
+ lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
+ lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
+ lstm_dropout (float): LSTM dropout probability.
+
+ Returns:
+ RNNT:
+ Emformer RNN-T model.
+ """
+ encoder = _EmformerEncoder(
+ input_dim=input_dim,
+ output_dim=encoding_dim,
+ segment_length=segment_length,
+ right_context_length=right_context_length,
+ time_reduction_input_dim=time_reduction_input_dim,
+ time_reduction_stride=time_reduction_stride,
+ transformer_num_heads=transformer_num_heads,
+ transformer_ffn_dim=transformer_ffn_dim,
+ transformer_num_layers=transformer_num_layers,
+ transformer_dropout=transformer_dropout,
+ transformer_activation=transformer_activation,
+ transformer_left_context_length=transformer_left_context_length,
+ transformer_max_memory_size=transformer_max_memory_size,
+ transformer_weight_init_scale_strategy=transformer_weight_init_scale_strategy,
+ transformer_tanh_on_mem=transformer_tanh_on_mem,
+ )
+ predictor = _Predictor(
+ num_symbols,
+ encoding_dim,
+ symbol_embedding_dim=symbol_embedding_dim,
+ num_lstm_layers=num_lstm_layers,
+ lstm_hidden_dim=symbol_embedding_dim,
+ lstm_layer_norm=lstm_layer_norm,
+ lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
+ lstm_dropout=lstm_dropout,
+ )
+ joiner = _Joiner(encoding_dim, num_symbols)
+ return RNNT(encoder, predictor, joiner)
+
+
+def emformer_rnnt_base(num_symbols: int) -> RNNT:
+ r"""Builds basic version of Emformer-based :class:`~torchaudio.models.RNNT`.
+
+ Args:
+ num_symbols (int): The size of target token lexicon.
+
+ Returns:
+ RNNT:
+ Emformer RNN-T model.
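+
+    Example (illustrative sketch; batch size, frame counts and vocabulary size are arbitrary):
+        >>> rnnt = emformer_rnnt_base(num_symbols=4097)
+        >>> sources = torch.rand(2, 100, 80)  # 96 utterance frames + 4 right-context frames
+        >>> source_lengths = torch.tensor([96, 96])
+        >>> targets = torch.randint(0, 4097, (2, 10))
+        >>> target_lengths = torch.tensor([10, 8])
+        >>> output, src_lengths, tgt_lengths, _ = rnnt(sources, source_lengths, targets, target_lengths)
+        >>> output.shape  # (B, T // time_reduction_stride, U, num_symbols)
+        torch.Size([2, 24, 10, 4097])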
+ """
+ return emformer_rnnt_model(
+ input_dim=80,
+ encoding_dim=1024,
+ num_symbols=num_symbols,
+ segment_length=16,
+ right_context_length=4,
+ time_reduction_input_dim=128,
+ time_reduction_stride=4,
+ transformer_num_heads=8,
+ transformer_ffn_dim=2048,
+ transformer_num_layers=20,
+ transformer_dropout=0.1,
+ transformer_activation="gelu",
+ transformer_left_context_length=30,
+ transformer_max_memory_size=0,
+ transformer_weight_init_scale_strategy="depthwise",
+ transformer_tanh_on_mem=True,
+ symbol_embedding_dim=512,
+ num_lstm_layers=3,
+ lstm_layer_norm=True,
+ lstm_layer_norm_epsilon=1e-3,
+ lstm_dropout=0.3,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/models/rnnt_decoder.py b/MLPY/Lib/site-packages/torchaudio/models/rnnt_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fe03715513a077984f5c3d4fcf95e0fa653b5f7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/rnnt_decoder.py
@@ -0,0 +1,339 @@
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+from torchaudio.models import RNNT
+
+
+__all__ = ["Hypothesis", "RNNTBeamSearch"]
+
+
+Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float]
+Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder,
+ represented as tuple of (tokens, prediction network output, prediction network state, score).
+ """
+
+
+def _get_hypo_tokens(hypo: Hypothesis) -> List[int]:
+ return hypo[0]
+
+
+def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor:
+ return hypo[1]
+
+
+def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]:
+ return hypo[2]
+
+
+def _get_hypo_score(hypo: Hypothesis) -> float:
+ return hypo[3]
+
+
+def _get_hypo_key(hypo: Hypothesis) -> str:
+ return str(hypo[0])
+
+
+def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]:
+ states: List[List[torch.Tensor]] = []
+ for i in range(len(_get_hypo_state(hypos[0]))):
+ batched_state_components: List[torch.Tensor] = []
+ for j in range(len(_get_hypo_state(hypos[0])[i])):
+ batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos]))
+ states.append(batched_state_components)
+ return states
+
+
+def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]:
+ idx_tensor = torch.tensor([idx], device=device)
+ return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states]
+
+
+def _default_hypo_sort_key(hypo: Hypothesis) -> float:
+ return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1)
+
+
+def _compute_updated_scores(
+ hypos: List[Hypothesis],
+ next_token_probs: torch.Tensor,
+ beam_width: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
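+    # Add each hypothesis score to the log-probabilities of all non-blank tokens, flatten the
+    # (hypothesis, token) grid and take the top ``beam_width`` entries; each flat index is then
+    # decomposed into a hypothesis index (integer division) and a token index (modulo).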
+ hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1)
+ nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1]
+ nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width)
+ nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc")
+ nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1]
+ return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token
+
+
+def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None:
+ for i, elem in enumerate(hypo_list):
+ if _get_hypo_key(hypo) == _get_hypo_key(elem):
+ del hypo_list[i]
+ break
+
+
+class RNNTBeamSearch(torch.nn.Module):
+ r"""Beam search decoder for RNN-T model.
+
+ See Also:
+ * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model.
+
+ Args:
+ model (RNNT): RNN-T model to use.
+ blank (int): index of blank token in vocabulary.
+ temperature (float, optional): temperature to apply to joint network output.
+ Larger values yield more uniform samples. (Default: 1.0)
+ hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score
+ for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns
+ hypothesis score normalized by token sequence length. (Default: None)
+ step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100)
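+
+    Example (illustrative sketch; the model, blank index and input sizes are arbitrary):
+        >>> from torchaudio.models import emformer_rnnt_base
+        >>> model = emformer_rnnt_base(num_symbols=4097)
+        >>> decoder = RNNTBeamSearch(model, blank=4096)
+        >>> input = torch.rand(100, 80)  # (T, D): 96 utterance frames + 4 right-context frames
+        >>> length = torch.tensor(96)
+        >>> hypotheses = decoder(input, length, beam_width=5)
+        >>> best_tokens = hypotheses[0][0]  # token sequence of the highest-ranked hypothesis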
+ """
+
+ def __init__(
+ self,
+ model: RNNT,
+ blank: int,
+ temperature: float = 1.0,
+ hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None,
+ step_max_tokens: int = 100,
+ ) -> None:
+ super().__init__()
+ self.model = model
+ self.blank = blank
+ self.temperature = temperature
+
+ if hypo_sort_key is None:
+ self.hypo_sort_key = _default_hypo_sort_key
+ else:
+ self.hypo_sort_key = hypo_sort_key
+
+ self.step_max_tokens = step_max_tokens
+
+ def _init_b_hypos(self, device: torch.device) -> List[Hypothesis]:
+ token = self.blank
+ state = None
+
+ one_tensor = torch.tensor([1], device=device)
+ pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state)
+ init_hypo = (
+ [token],
+ pred_out[0].detach(),
+ pred_state,
+ 0.0,
+ )
+ return [init_hypo]
+
+ def _gen_next_token_probs(
+ self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device
+ ) -> torch.Tensor:
+ one_tensor = torch.tensor([1], device=device)
+ predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0)
+ joined_out, _, _ = self.model.join(
+ enc_out,
+ one_tensor,
+ predictor_out,
+ torch.tensor([1] * len(hypos), device=device),
+ ) # [beam_width, 1, 1, num_tokens]
+ joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3)
+ return joined_out[:, 0, 0]
+
+ def _gen_b_hypos(
+ self,
+ b_hypos: List[Hypothesis],
+ a_hypos: List[Hypothesis],
+ next_token_probs: torch.Tensor,
+ key_to_b_hypo: Dict[str, Hypothesis],
+ ) -> List[Hypothesis]:
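+        # Extend every hypothesis in A with the blank symbol. If an identical token sequence is
+        # already present in B, merge the two paths by log-add-exp of their scores instead of
+        # keeping a duplicate; B is then re-sorted by score (ascending).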
+ for i in range(len(a_hypos)):
+ h_a = a_hypos[i]
+ append_blank_score = _get_hypo_score(h_a) + next_token_probs[i, -1]
+ if _get_hypo_key(h_a) in key_to_b_hypo:
+ h_b = key_to_b_hypo[_get_hypo_key(h_a)]
+ _remove_hypo(h_b, b_hypos)
+ score = float(torch.tensor(_get_hypo_score(h_b)).logaddexp(append_blank_score))
+ else:
+ score = float(append_blank_score)
+ h_b = (
+ _get_hypo_tokens(h_a),
+ _get_hypo_predictor_out(h_a),
+ _get_hypo_state(h_a),
+ score,
+ )
+ b_hypos.append(h_b)
+ key_to_b_hypo[_get_hypo_key(h_b)] = h_b
+ _, sorted_idx = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]).sort()
+ return [b_hypos[idx] for idx in sorted_idx]
+
+ def _gen_a_hypos(
+ self,
+ a_hypos: List[Hypothesis],
+ b_hypos: List[Hypothesis],
+ next_token_probs: torch.Tensor,
+ t: int,
+ beam_width: int,
+ device: torch.device,
+ ) -> List[Hypothesis]:
+ (
+ nonblank_nbest_scores,
+ nonblank_nbest_hypo_idx,
+ nonblank_nbest_token,
+ ) = _compute_updated_scores(a_hypos, next_token_probs, beam_width)
+
+ if len(b_hypos) < beam_width:
+ b_nbest_score = -float("inf")
+ else:
+ b_nbest_score = _get_hypo_score(b_hypos[-beam_width])
+
+ base_hypos: List[Hypothesis] = []
+ new_tokens: List[int] = []
+ new_scores: List[float] = []
+ for i in range(beam_width):
+ score = float(nonblank_nbest_scores[i])
+ if score > b_nbest_score:
+ a_hypo_idx = int(nonblank_nbest_hypo_idx[i])
+ base_hypos.append(a_hypos[a_hypo_idx])
+ new_tokens.append(int(nonblank_nbest_token[i]))
+ new_scores.append(score)
+
+ if base_hypos:
+ new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device)
+ else:
+ new_hypos: List[Hypothesis] = []
+
+ return new_hypos
+
+ def _gen_new_hypos(
+ self,
+ base_hypos: List[Hypothesis],
+ tokens: List[int],
+ scores: List[float],
+ t: int,
+ device: torch.device,
+ ) -> List[Hypothesis]:
+ tgt_tokens = torch.tensor([[token] for token in tokens], device=device)
+ states = _batch_state(base_hypos)
+ pred_out, _, pred_states = self.model.predict(
+ tgt_tokens,
+ torch.tensor([1] * len(base_hypos), device=device),
+ states,
+ )
+ new_hypos: List[Hypothesis] = []
+ for i, h_a in enumerate(base_hypos):
+ new_tokens = _get_hypo_tokens(h_a) + [tokens[i]]
+ new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i]))
+ return new_hypos
+
+ def _search(
+ self,
+ enc_out: torch.Tensor,
+ hypo: Optional[List[Hypothesis]],
+ beam_width: int,
+ ) -> List[Hypothesis]:
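+        # Time-synchronous RNN-T beam search: for each encoder frame t, ``a_hypos`` holds
+        # candidates that may still emit another non-blank token at this frame, while ``b_hypos``
+        # collects candidates that have emitted blank and are done with frame t.
+        # ``step_max_tokens`` caps the number of non-blank expansions per frame.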
+ n_time_steps = enc_out.shape[1]
+ device = enc_out.device
+
+ a_hypos: List[Hypothesis] = []
+ b_hypos = self._init_b_hypos(device) if hypo is None else hypo
+ for t in range(n_time_steps):
+ a_hypos = b_hypos
+ b_hypos = torch.jit.annotate(List[Hypothesis], [])
+ key_to_b_hypo: Dict[str, Hypothesis] = {}
+ symbols_current_t = 0
+
+ while a_hypos:
+ next_token_probs = self._gen_next_token_probs(enc_out[:, t : t + 1], a_hypos, device)
+ next_token_probs = next_token_probs.cpu()
+ b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo)
+
+ if symbols_current_t == self.step_max_tokens:
+ break
+
+ a_hypos = self._gen_a_hypos(
+ a_hypos,
+ b_hypos,
+ next_token_probs,
+ t,
+ beam_width,
+ device,
+ )
+ if a_hypos:
+ symbols_current_t += 1
+
+ _, sorted_idx = torch.tensor([self.hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width)
+ b_hypos = [b_hypos[idx] for idx in sorted_idx]
+
+ return b_hypos
+
+ def forward(self, input: torch.Tensor, length: torch.Tensor, beam_width: int) -> List[Hypothesis]:
+ r"""Performs beam search for the given input sequence.
+
+ T: number of frames;
+ D: feature dimension of each frame.
+
+ Args:
+ input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
+ length (torch.Tensor): number of valid frames in input
+ sequence, with shape () or (1,).
+ beam_width (int): beam size to use during search.
+
+ Returns:
+ List[Hypothesis]: top-``beam_width`` hypotheses found by beam search.
+ """
+ if input.dim() != 2 and not (input.dim() == 3 and input.shape[0] == 1):
+ raise ValueError("input must be of shape (T, D) or (1, T, D)")
+ if input.dim() == 2:
+ input = input.unsqueeze(0)
+
+ if length.shape != () and length.shape != (1,):
+ raise ValueError("length must be of shape () or (1,)")
+ if length.dim() == 0:
+ length = length.unsqueeze(0)
+
+ enc_out, _ = self.model.transcribe(input, length)
+ return self._search(enc_out, None, beam_width)
+
+ @torch.jit.export
+ def infer(
+ self,
+ input: torch.Tensor,
+ length: torch.Tensor,
+ beam_width: int,
+ state: Optional[List[List[torch.Tensor]]] = None,
+ hypothesis: Optional[List[Hypothesis]] = None,
+ ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]:
+ r"""Performs beam search for the given input sequence in streaming mode.
+
+ T: number of frames;
+ D: feature dimension of each frame.
+
+ Args:
+ input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
+ length (torch.Tensor): number of valid frames in input
+ sequence, with shape () or (1,).
+ beam_width (int): beam size to use during search.
+ state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
+ representing transcription network internal state generated in preceding
+ invocation. (Default: ``None``)
+ hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed
+ search with. (Default: ``None``)
+
+ Returns:
+ (List[Hypothesis], List[List[torch.Tensor]]):
+ List[Hypothesis]
+ top-``beam_width`` hypotheses found by beam search.
+ List[List[torch.Tensor]]
+ list of lists of tensors representing transcription network
+ internal state generated in current invocation.
+ """
+ if input.dim() != 2 and not (input.dim() == 3 and input.shape[0] == 1):
+ raise ValueError("input must be of shape (T, D) or (1, T, D)")
+ if input.dim() == 2:
+ input = input.unsqueeze(0)
+
+ if length.shape != () and length.shape != (1,):
+ raise ValueError("length must be of shape () or (1,)")
+ if length.dim() == 0:
+ length = length.unsqueeze(0)
+
+ enc_out, _, state = self.model.transcribe_streaming(input, length, state)
+ return self._search(enc_out, hypothesis, beam_width), state
diff --git a/MLPY/Lib/site-packages/torchaudio/models/squim/__init__.py b/MLPY/Lib/site-packages/torchaudio/models/squim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d52102f153c973cebd9215f27481a9ec1b415139
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/squim/__init__.py
@@ -0,0 +1,11 @@
+from .objective import squim_objective_base, squim_objective_model, SquimObjective
+from .subjective import squim_subjective_base, squim_subjective_model, SquimSubjective
+
+__all__ = [
+ "squim_objective_base",
+ "squim_objective_model",
+ "squim_subjective_base",
+ "squim_subjective_model",
+ "SquimObjective",
+ "SquimSubjective",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa892cf12ffe33a7598bc90621bab76341110022
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/objective.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/objective.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1dbdc4448d1cd94385a510c2cd87ac21a5e0f8ba
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/objective.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/subjective.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/subjective.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7128d74b4ac9e09bff4f5925d0eb46c12f5ff814
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/squim/__pycache__/subjective.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/squim/objective.py b/MLPY/Lib/site-packages/torchaudio/models/squim/objective.py
new file mode 100644
index 0000000000000000000000000000000000000000..83155e7f3fb8c1cc5592fc8d2ed75fe9e03cdb28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/squim/objective.py
@@ -0,0 +1,326 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def transform_wb_pesq_range(x: float) -> float:
+ """The metric defined by ITU-T P.862 is often called 'PESQ score', which is defined
+ for narrow-band signals and has a value range of [-0.5, 4.5] exactly. Here, we use the metric
+ defined by ITU-T P.862.2, commonly known as 'wide-band PESQ' and will be referred to as "PESQ score".
+
+ Args:
+ x (float): Narrow-band PESQ score.
+
+ Returns:
+ (float): Wide-band PESQ score.
+ """
+ return 0.999 + (4.999 - 0.999) / (1 + math.exp(-1.3669 * x + 3.8224))
+
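+# A quick illustration of the mapping above (values rounded, for orientation only):
+#
+#     >>> round(transform_wb_pesq_range(4.5), 2)   # narrow-band maximum
+#     4.64
+#     >>> round(transform_wb_pesq_range(-0.5), 2)  # narrow-band minimum
+#     1.04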
+
+PESQRange: Tuple[float, float] = (
+ 1.0, # P.862.2 uses a different input filter than P.862, and the lower bound of
+ # the raw score is not -0.5 anymore. It's hard to figure out the true lower bound.
+ # We are using 1.0 as a reasonable approximation.
+ transform_wb_pesq_range(4.5),
+)
+
+
+class RangeSigmoid(nn.Module):
+ def __init__(self, val_range: Tuple[float, float] = (0.0, 1.0)) -> None:
+ super(RangeSigmoid, self).__init__()
+ assert isinstance(val_range, tuple) and len(val_range) == 2
+ self.val_range: Tuple[float, float] = val_range
+ self.sigmoid: nn.modules.Module = nn.Sigmoid()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ out = self.sigmoid(x) * (self.val_range[1] - self.val_range[0]) + self.val_range[0]
+ return out
+
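+# ``RangeSigmoid`` rescales a standard sigmoid into ``val_range``; e.g. with
+# ``val_range=PESQRange`` the PESQ branch output stays inside the wide-band PESQ range.
+# A tiny sketch (illustrative):
+#
+#     >>> RangeSigmoid((0.0, 2.0))(torch.zeros(1))  # sigmoid(0) == 0.5 -> midpoint of the range
+#     tensor([1.])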
+
+class Encoder(nn.Module):
+ """Encoder module that transform 1D waveform to 2D representations.
+
+ Args:
+ feat_dim (int, optional): The feature dimension after Encoder module. (Default: 512)
+ win_len (int, optional): kernel size in the Conv1D layer. (Default: 32)
+ """
+
+ def __init__(self, feat_dim: int = 512, win_len: int = 32) -> None:
+ super(Encoder, self).__init__()
+
+ self.conv1d = nn.Conv1d(1, feat_dim, win_len, stride=win_len // 2, bias=False)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Apply waveforms to convolutional layer and ReLU layer.
+
+ Args:
+ x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.
+
+ Returns:
+ (torch.Tensor): Feature Tensor with dimensions `(batch, channel, frame)`.
+ """
+ out = x.unsqueeze(dim=1)
+ out = F.relu(self.conv1d(out))
+ return out
+
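+# A shape sketch for the encoder (illustrative): the Conv1d uses stride ``win_len // 2``,
+# so with the defaults a one-second 16 kHz waveform becomes (16000 - 32) // 16 + 1 = 999 frames.
+#
+#     >>> enc = Encoder(feat_dim=512, win_len=32)
+#     >>> enc(torch.randn(1, 16000)).shape
+#     torch.Size([1, 512, 999])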
+
+class SingleRNN(nn.Module):
+ def __init__(self, rnn_type: str, input_size: int, hidden_size: int, dropout: float = 0.0) -> None:
+ super(SingleRNN, self).__init__()
+
+ self.rnn_type = rnn_type
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+
+ self.rnn: nn.modules.Module = getattr(nn, rnn_type)(
+ input_size,
+ hidden_size,
+ 1,
+ dropout=dropout,
+ batch_first=True,
+ bidirectional=True,
+ )
+
+ self.proj = nn.Linear(hidden_size * 2, input_size)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # input shape: batch, seq, dim
+ out, _ = self.rnn(x)
+ out = self.proj(out)
+ return out
+
+
+class DPRNN(nn.Module):
+ """*Dual-path recurrent neural networks (DPRNN)* :cite:`luo2020dual`.
+
+ Args:
+ feat_dim (int, optional): The feature dimension after Encoder module. (Default: 64)
+ hidden_dim (int, optional): Hidden dimension in the RNN layer of DPRNN. (Default: 128)
+ num_blocks (int, optional): Number of DPRNN layers. (Default: 6)
+ rnn_type (str, optional): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. (Default: "LSTM")
+ d_model (int, optional): The number of expected features in the input. (Default: 256)
+ chunk_size (int, optional): Chunk size of input for DPRNN. (Default: 100)
+ chunk_stride (int, optional): Stride of chunk input for DPRNN. (Default: 50)
+ """
+
+ def __init__(
+ self,
+ feat_dim: int = 64,
+ hidden_dim: int = 128,
+ num_blocks: int = 6,
+ rnn_type: str = "LSTM",
+ d_model: int = 256,
+ chunk_size: int = 100,
+ chunk_stride: int = 50,
+ ) -> None:
+ super(DPRNN, self).__init__()
+
+ self.num_blocks = num_blocks
+
+ self.row_rnn = nn.ModuleList([])
+ self.col_rnn = nn.ModuleList([])
+ self.row_norm = nn.ModuleList([])
+ self.col_norm = nn.ModuleList([])
+ for _ in range(num_blocks):
+ self.row_rnn.append(SingleRNN(rnn_type, feat_dim, hidden_dim))
+ self.col_rnn.append(SingleRNN(rnn_type, feat_dim, hidden_dim))
+ self.row_norm.append(nn.GroupNorm(1, feat_dim, eps=1e-8))
+ self.col_norm.append(nn.GroupNorm(1, feat_dim, eps=1e-8))
+ self.conv = nn.Sequential(
+ nn.Conv2d(feat_dim, d_model, 1),
+ nn.PReLU(),
+ )
+ self.chunk_size = chunk_size
+ self.chunk_stride = chunk_stride
+
+ def pad_chunk(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]:
+ # input shape: (B, N, T)
+ seq_len = x.shape[-1]
+
+ rest = self.chunk_size - (self.chunk_stride + seq_len % self.chunk_size) % self.chunk_size
+ out = F.pad(x, [self.chunk_stride, rest + self.chunk_stride])
+
+ return out, rest
+
+ def chunking(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]:
+ out, rest = self.pad_chunk(x)
+ batch_size, feat_dim, seq_len = out.shape
+
+ segments1 = out[:, :, : -self.chunk_stride].contiguous().view(batch_size, feat_dim, -1, self.chunk_size)
+ segments2 = out[:, :, self.chunk_stride :].contiguous().view(batch_size, feat_dim, -1, self.chunk_size)
+ out = torch.cat([segments1, segments2], dim=3)
+ out = out.view(batch_size, feat_dim, -1, self.chunk_size).transpose(2, 3).contiguous()
+
+ return out, rest
+
+ def merging(self, x: torch.Tensor, rest: int) -> torch.Tensor:
+ batch_size, dim, _, _ = x.shape
+ out = x.transpose(2, 3).contiguous().view(batch_size, dim, -1, self.chunk_size * 2)
+ out1 = out[:, :, :, : self.chunk_size].contiguous().view(batch_size, dim, -1)[:, :, self.chunk_stride :]
+ out2 = out[:, :, :, self.chunk_size :].contiguous().view(batch_size, dim, -1)[:, :, : -self.chunk_stride]
+ out = out1 + out2
+ if rest > 0:
+ out = out[:, :, :-rest]
+ out = out.contiguous()
+ return out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x, rest = self.chunking(x)
+ batch_size, _, dim1, dim2 = x.shape
+ out = x
+ for row_rnn, row_norm, col_rnn, col_norm in zip(self.row_rnn, self.row_norm, self.col_rnn, self.col_norm):
+ row_in = out.permute(0, 3, 2, 1).contiguous().view(batch_size * dim2, dim1, -1).contiguous()
+ row_out = row_rnn(row_in)
+ row_out = row_out.view(batch_size, dim2, dim1, -1).permute(0, 3, 2, 1).contiguous()
+ row_out = row_norm(row_out)
+ out = out + row_out
+
+ col_in = out.permute(0, 2, 3, 1).contiguous().view(batch_size * dim1, dim2, -1).contiguous()
+ col_out = col_rnn(col_in)
+ col_out = col_out.view(batch_size, dim1, dim2, -1).permute(0, 3, 1, 2).contiguous()
+ col_out = col_norm(col_out)
+ out = out + col_out
+ out = self.conv(out)
+ out = self.merging(out, rest)
+ out = out.transpose(1, 2).contiguous()
+ return out
+
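+# Dual-path processing, as implemented above: ``chunking`` folds the (padded) sequence
+# into overlapping chunks, each block applies an intra-chunk ("row") bidirectional RNN
+# followed by an inter-chunk ("column") RNN with residual connections, and ``merging``
+# overlap-adds the chunks back into a single sequence. The final output has shape
+# (batch, frames, d_model) after the last transpose.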
+
+class AutoPool(nn.Module):
+ def __init__(self, pool_dim: int = 1) -> None:
+ super(AutoPool, self).__init__()
+ self.pool_dim: int = pool_dim
+ self.softmax: nn.modules.Module = nn.Softmax(dim=pool_dim)
+ self.register_parameter("alpha", nn.Parameter(torch.ones(1)))
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ weight = self.softmax(torch.mul(x, self.alpha))
+ out = torch.sum(torch.mul(x, weight), dim=self.pool_dim)
+ return out
+
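+# ``AutoPool`` is an adaptive pooling layer: the learned scalar ``alpha`` scales the
+# inputs before the softmax weighting, so the layer can move between roughly mean-like
+# pooling (small ``alpha``) and max-like pooling (large ``alpha``) along ``pool_dim``.
+# For example, a (batch, time, d_model) input is pooled to (batch, d_model).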
+
+class SquimObjective(nn.Module):
+ """Speech Quality and Intelligibility Measures (SQUIM) model that predicts **objective** metric scores
+ for speech enhancement (e.g., STOI, PESQ, and SI-SDR).
+
+ Args:
+ encoder (torch.nn.Module): Encoder module to transform 1D waveform to 2D feature representation.
+ dprnn (torch.nn.Module): DPRNN module to model sequential feature.
+ branches (torch.nn.ModuleList): Transformer branches, each of which estimates one objective metric score.
+ """
+
+ def __init__(
+ self,
+ encoder: nn.Module,
+ dprnn: nn.Module,
+ branches: nn.ModuleList,
+ ):
+ super(SquimObjective, self).__init__()
+ self.encoder = encoder
+ self.dprnn = dprnn
+ self.branches = branches
+
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+ """
+ Args:
+ x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.
+
+ Returns:
+ List[torch.Tensor]: List of score Tensors. Each Tensor has dimension `(batch,)`.
+ """
+ if x.ndim != 2:
+ raise ValueError(f"The input must be a 2D Tensor. Found dimension {x.ndim}.")
+ x = x / (torch.mean(x**2, dim=1, keepdim=True) ** 0.5 * 20)
+ out = self.encoder(x)
+ out = self.dprnn(out)
+ scores = []
+ for branch in self.branches:
+ scores.append(branch(out).squeeze(dim=1))
+ return scores
+
+
+def _create_branch(d_model: int, nhead: int, metric: str) -> nn.modules.Module:
+ """Create branch module after DPRNN model for predicting metric score.
+
+ Args:
+ d_model (int): The number of expected features in the input.
+ nhead (int): Number of heads in the multi-head attention model.
+ metric (str): The metric name to predict.
+
+ Returns:
+ (nn.Module): Returned module to predict corresponding metric score.
+ """
+ layer1 = nn.TransformerEncoderLayer(d_model, nhead, d_model * 4, dropout=0.0, batch_first=True)
+ layer2 = AutoPool()
+ if metric == "stoi":
+ layer3 = nn.Sequential(
+ nn.Linear(d_model, d_model),
+ nn.PReLU(),
+ nn.Linear(d_model, 1),
+ RangeSigmoid(),
+ )
+ elif metric == "pesq":
+ layer3 = nn.Sequential(
+ nn.Linear(d_model, d_model),
+ nn.PReLU(),
+ nn.Linear(d_model, 1),
+ RangeSigmoid(val_range=PESQRange),
+ )
+ else:
+ layer3: nn.modules.Module = nn.Sequential(nn.Linear(d_model, d_model), nn.PReLU(), nn.Linear(d_model, 1))
+ return nn.Sequential(layer1, layer2, layer3)
+
+
+def squim_objective_model(
+ feat_dim: int,
+ win_len: int,
+ d_model: int,
+ nhead: int,
+ hidden_dim: int,
+ num_blocks: int,
+ rnn_type: str,
+ chunk_size: int,
+ chunk_stride: Optional[int] = None,
+) -> SquimObjective:
+ """Build a custome :class:`torchaudio.prototype.models.SquimObjective` model.
+
+ Args:
+ feat_dim (int): The feature dimension after the Encoder module.
+ win_len (int): Kernel size in the Encoder module.
+ d_model (int): The number of expected features in the input.
+ nhead (int): Number of heads in the multi-head attention model.
+ hidden_dim (int): Hidden dimension in the RNN layer of DPRNN.
+ num_blocks (int): Number of DPRNN layers.
+ rnn_type (str): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"].
+ chunk_size (int): Chunk size of input for DPRNN.
+ chunk_stride (int or None, optional): Stride of chunk input for DPRNN.
+ """
+ if chunk_stride is None:
+ chunk_stride = chunk_size // 2
+ encoder = Encoder(feat_dim, win_len)
+ dprnn = DPRNN(feat_dim, hidden_dim, num_blocks, rnn_type, d_model, chunk_size, chunk_stride)
+ branches = nn.ModuleList(
+ [
+ _create_branch(d_model, nhead, "stoi"),
+ _create_branch(d_model, nhead, "pesq"),
+ _create_branch(d_model, nhead, "sisdr"),
+ ]
+ )
+ return SquimObjective(encoder, dprnn, branches)
+
+
+def squim_objective_base() -> SquimObjective:
+ """Build :class:`torchaudio.prototype.models.SquimObjective` model with default arguments."""
+ return squim_objective_model(
+ feat_dim=256,
+ win_len=64,
+ d_model=256,
+ nhead=4,
+ hidden_dim=256,
+ num_blocks=2,
+ rnn_type="LSTM",
+ chunk_size=71,
+ )
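+
+
+# A minimal end-to-end sketch (randomly initialized weights, shapes illustrative;
+# trained weights would normally come from a released checkpoint or bundle):
+#
+#     >>> model = squim_objective_base().eval()
+#     >>> waveform = torch.randn(1, 16000)                       # (batch, time), 16 kHz assumed
+#     >>> with torch.no_grad():
+#     ...     stoi_hyp, pesq_hyp, si_sdr_hyp = model(waveform)   # each of shape (batch,)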
diff --git a/MLPY/Lib/site-packages/torchaudio/models/squim/subjective.py b/MLPY/Lib/site-packages/torchaudio/models/squim/subjective.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3cc8ba3fc60e73351049ec1317ef3ddb050f70e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/squim/subjective.py
@@ -0,0 +1,150 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torchaudio
+
+
+class AttPool(nn.Module):
+ """Attention-Pooling module that estimates the attention score.
+
+ Args:
+ input_dim (int): Input feature dimension.
+ att_dim (int): Attention Tensor dimension.
+ """
+
+ def __init__(self, input_dim: int, att_dim: int):
+ super(AttPool, self).__init__()
+
+ self.linear1 = nn.Linear(input_dim, 1)
+ self.linear2 = nn.Linear(input_dim, att_dim)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Apply attention and pooling.
+
+ Args:
+ x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.
+
+ Returns:
+ (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
+ """
+
+ att = self.linear1(x) # (batch, time, 1)
+ att = att.transpose(2, 1) # (batch, 1, time)
+ att = nn.functional.softmax(att, dim=2)
+ x = torch.matmul(att, x).squeeze(1) # (batch, input_dim)
+ x = self.linear2(x) # (batch, att_dim)
+ return x
+
+
+class Predictor(nn.Module):
+ """Prediction module that apply pooling and attention, then predict subjective metric scores.
+
+ Args:
+ input_dim (int): Input feature dimension.
+ att_dim (int): Attention Tensor dimension.
+ """
+
+ def __init__(self, input_dim: int, att_dim: int):
+ super(Predictor, self).__init__()
+ self.att_pool_layer = AttPool(input_dim, att_dim)
+ self.att_dim = att_dim
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Predict subjective evaluation metric score.
+
+ Args:
+ x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.
+
+ Returns:
+ (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
+ """
+ x = self.att_pool_layer(x)
+ x = nn.functional.softmax(x, dim=1)
+ B = torch.linspace(0, 4, steps=self.att_dim, device=x.device)
+ x = (x * B).sum(dim=1)
+ return x
+
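+# The predictor treats the ``att_dim`` pooled outputs as logits over evenly spaced
+# score-difference bins: after the softmax, the weighted sum with ``linspace(0, 4)`` is the
+# expected difference from the clean reference. ``SquimSubjective.forward`` below returns
+# ``5 - difference`` so the final score falls in a MOS-like [1, 5] range.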
+
+class SquimSubjective(nn.Module):
+ """Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
+ for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adapted from *NORESQA-MOS*
+ :cite:`manocha2022speech`, which predicts MOS scores given the input speech and a non-matching reference.
+
+ Args:
+ ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
+ projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
+ predictor (torch.nn.Module): Predict the subjective scores.
+ """
+
+ def __init__(self, ssl_model: nn.Module, projector: nn.Module, predictor: nn.Module):
+ super(SquimSubjective, self).__init__()
+ self.ssl_model = ssl_model
+ self.projector = projector
+ self.predictor = predictor
+
+ def _align_shapes(self, waveform: torch.Tensor, reference: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Cut or pad the reference Tensor to make it aligned with waveform Tensor.
+
+ Args:
+ waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
+ reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.
+
+ Returns:
+ (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
+ with same dimensions `(batch, time)`.
+ """
+ T_waveform = waveform.shape[-1]
+ T_reference = reference.shape[-1]
+ if T_reference < T_waveform:
+ num_padding = T_waveform // T_reference + 1
+ reference = torch.cat([reference for _ in range(num_padding)], dim=1)
+ return waveform, reference[:, :T_waveform]
+
+ def forward(self, waveform: torch.Tensor, reference: torch.Tensor):
+ """Predict subjective evaluation metric score.
+
+ Args:
+ waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
+ reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.
+
+ Returns:
+ (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
+ """
+ waveform, reference = self._align_shapes(waveform, reference)
+ waveform = self.projector(self.ssl_model.extract_features(waveform)[0][-1])
+ reference = self.projector(self.ssl_model.extract_features(reference)[0][-1])
+ concat = torch.cat((reference, waveform), dim=2)
+ score_diff = self.predictor(concat) # Score difference compared to the reference
+ return 5 - score_diff
+
+
+def squim_subjective_model(
+ ssl_type: str,
+ feat_dim: int,
+ proj_dim: int,
+ att_dim: int,
+) -> SquimSubjective:
+ """Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.
+
+ Args:
+ ssl_type (str): Type of self-supervised learning (SSL) models.
+ Must be one of ["wav2vec2_base", "wav2vec2_large"].
+ feat_dim (int): Feature dimension of the SSL feature representation.
+ proj_dim (int): Output dimension of projection layer.
+ att_dim (int): Dimension of attention scores.
+ """
+ ssl_model = getattr(torchaudio.models, ssl_type)()
+ projector = nn.Linear(feat_dim, proj_dim)
+ predictor = Predictor(proj_dim * 2, att_dim)
+ return SquimSubjective(ssl_model, projector, predictor)
+
+
+def squim_subjective_base() -> SquimSubjective:
+ """Build :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments."""
+ return squim_subjective_model(
+ ssl_type="wav2vec2_base",
+ feat_dim=768,
+ proj_dim=32,
+ att_dim=5,
+ )
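+
+
+# A minimal usage sketch (randomly initialized weights; the reference can be any clean
+# speech clip and does not need to match the content of the test utterance):
+#
+#     >>> model = squim_subjective_base().eval()
+#     >>> test_wav = torch.randn(1, 16000)          # (batch, time)
+#     >>> reference = torch.randn(1, 16000)         # (batch, time_ref)
+#     >>> with torch.no_grad():
+#     ...     mos_hyp = model(test_wav, reference)  # shape (batch,), MOS-like scale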
diff --git a/MLPY/Lib/site-packages/torchaudio/models/tacotron2.py b/MLPY/Lib/site-packages/torchaudio/models/tacotron2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad4f9b21a66e69c3e0fdb8bb80e80cbcbe2ef429
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/tacotron2.py
@@ -0,0 +1,1046 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+
+
+__all__ = [
+ "Tacotron2",
+]
+
+
+def _get_linear_layer(in_dim: int, out_dim: int, bias: bool = True, w_init_gain: str = "linear") -> torch.nn.Linear:
+ r"""Linear layer with xavier uniform initialization.
+
+ Args:
+ in_dim (int): Size of each input sample.
+ out_dim (int): Size of each output sample.
+ bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``)
+ w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
+ for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)
+
+ Returns:
+ (torch.nn.Linear): The corresponding linear layer.
+ """
+ linear = torch.nn.Linear(in_dim, out_dim, bias=bias)
+ torch.nn.init.xavier_uniform_(linear.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+ return linear
+
+
+def _get_conv1d_layer(
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int = 1,
+ stride: int = 1,
+ padding: Optional[Union[str, int, Tuple[int]]] = None,
+ dilation: int = 1,
+ bias: bool = True,
+ w_init_gain: str = "linear",
+) -> torch.nn.Conv1d:
+ r"""1D convolution with xavier uniform initialization.
+
+ Args:
+ in_channels (int): Number of channels in the input image.
+ out_channels (int): Number of channels produced by the convolution.
+ kernel_size (int, optional): Size of the convolving kernel. (Default: ``1``)
+ stride (int, optional): Stride of the convolution. (Default: ``1``)
+ padding (str, int or tuple, optional): Padding added to both sides of the input.
+ (Default: ``dilation * (kernel_size - 1) / 2``)
+ dilation (int, optional): Spacing between kernel elements. (Default: ``1``)
+ bias (bool, optional): If ``True``, adds a learnable bias to the output. (Default: ``True``)
+ w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
+ for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)
+
+ Returns:
+ (torch.nn.Conv1d): The corresponding Conv1D layer.
+ """
+ if padding is None:
+ if kernel_size % 2 != 1:
+ raise ValueError("kernel_size must be odd")
+ padding = int(dilation * (kernel_size - 1) / 2)
+
+ conv1d = torch.nn.Conv1d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias,
+ )
+
+ torch.nn.init.xavier_uniform_(conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+
+ return conv1d
+
+
+def _get_mask_from_lengths(lengths: Tensor) -> Tensor:
+ r"""Returns a binary mask based on ``lengths``. The ``i``-th row and ``j``-th column of the mask
+ is ``1`` if ``j`` is smaller than ``i``-th element of ``lengths.
+
+ Args:
+ lengths (Tensor): The length of each element in the batch, with shape (n_batch, ).
+
+ Returns:
+ mask (Tensor): The binary mask, with shape (n_batch, max of ``lengths``).
+ """
+ max_len = torch.max(lengths).item()
+ ids = torch.arange(0, max_len, device=lengths.device, dtype=lengths.dtype)
+ mask = (ids < lengths.unsqueeze(1)).byte()
+ mask = torch.le(mask, 0)
+ return mask
+
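+# A small example of the padding mask (``True`` marks padded positions; output shown approximately):
+#
+#     >>> _get_mask_from_lengths(torch.tensor([1, 3]))
+#     tensor([[False,  True,  True],
+#             [False, False, False]])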
+
+class _LocationLayer(nn.Module):
+ r"""Location layer used in the Attention model.
+
+ Args:
+ attention_n_filter (int): Number of filters for attention model.
+ attention_kernel_size (int): Kernel size for attention model.
+ attention_hidden_dim (int): Dimension of attention hidden representation.
+ """
+
+ def __init__(
+ self,
+ attention_n_filter: int,
+ attention_kernel_size: int,
+ attention_hidden_dim: int,
+ ):
+ super().__init__()
+ padding = int((attention_kernel_size - 1) / 2)
+ self.location_conv = _get_conv1d_layer(
+ 2,
+ attention_n_filter,
+ kernel_size=attention_kernel_size,
+ padding=padding,
+ bias=False,
+ stride=1,
+ dilation=1,
+ )
+ self.location_dense = _get_linear_layer(
+ attention_n_filter, attention_hidden_dim, bias=False, w_init_gain="tanh"
+ )
+
+ def forward(self, attention_weights_cat: Tensor) -> Tensor:
+ r"""Location layer used in the Attention model.
+
+ Args:
+ attention_weights_cat (Tensor): Cumulative and previous attention weights
+ with shape (n_batch, 2, max of ``text_lengths``).
+
+ Returns:
+ processed_attention (Tensor): Cumulative and previous attention weights
+ with shape (n_batch, ``attention_hidden_dim``).
+ """
+ # (n_batch, attention_n_filter, text_lengths.max())
+ processed_attention = self.location_conv(attention_weights_cat)
+ processed_attention = processed_attention.transpose(1, 2)
+ # (n_batch, text_lengths.max(), attention_hidden_dim)
+ processed_attention = self.location_dense(processed_attention)
+ return processed_attention
+
+
+class _Attention(nn.Module):
+ r"""Locally sensitive attention model.
+
+ Args:
+ attention_rnn_dim (int): Number of hidden units for RNN.
+ encoder_embedding_dim (int): Number of embedding dimensions in the Encoder.
+ attention_hidden_dim (int): Dimension of attention hidden representation.
+ attention_location_n_filter (int): Number of filters for Attention model.
+ attention_location_kernel_size (int): Kernel size for Attention model.
+ """
+
+ def __init__(
+ self,
+ attention_rnn_dim: int,
+ encoder_embedding_dim: int,
+ attention_hidden_dim: int,
+ attention_location_n_filter: int,
+ attention_location_kernel_size: int,
+ ) -> None:
+ super().__init__()
+ self.query_layer = _get_linear_layer(attention_rnn_dim, attention_hidden_dim, bias=False, w_init_gain="tanh")
+ self.memory_layer = _get_linear_layer(
+ encoder_embedding_dim, attention_hidden_dim, bias=False, w_init_gain="tanh"
+ )
+ self.v = _get_linear_layer(attention_hidden_dim, 1, bias=False)
+ self.location_layer = _LocationLayer(
+ attention_location_n_filter,
+ attention_location_kernel_size,
+ attention_hidden_dim,
+ )
+ self.score_mask_value = -float("inf")
+
+ def _get_alignment_energies(self, query: Tensor, processed_memory: Tensor, attention_weights_cat: Tensor) -> Tensor:
+ r"""Get the alignment vector.
+
+ Args:
+ query (Tensor): Decoder output with shape (n_batch, n_mels * n_frames_per_step).
+ processed_memory (Tensor): Processed Encoder outputs
+ with shape (n_batch, max of ``text_lengths``, attention_hidden_dim).
+ attention_weights_cat (Tensor): Cumulative and previous attention weights
+ with shape (n_batch, 2, max of ``text_lengths``).
+
+ Returns:
+ alignment (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
+ """
+
+ processed_query = self.query_layer(query.unsqueeze(1))
+ processed_attention_weights = self.location_layer(attention_weights_cat)
+ energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory))
+
+ alignment = energies.squeeze(2)
+ return alignment
+
+ def forward(
+ self,
+ attention_hidden_state: Tensor,
+ memory: Tensor,
+ processed_memory: Tensor,
+ attention_weights_cat: Tensor,
+ mask: Tensor,
+ ) -> Tuple[Tensor, Tensor]:
+ r"""Pass the input through the Attention model.
+
+ Args:
+ attention_hidden_state (Tensor): Attention rnn last output with shape (n_batch, ``attention_rnn_dim``).
+ memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+ processed_memory (Tensor): Processed Encoder outputs
+ with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
+ attention_weights_cat (Tensor): Previous and cumulative attention weights
+ with shape (n_batch, current_num_frames * 2, max of ``text_lengths``).
+ mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).
+
+ Returns:
+ attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
+ attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
+ """
+ alignment = self._get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat)
+
+ alignment = alignment.masked_fill(mask, self.score_mask_value)
+
+ attention_weights = F.softmax(alignment, dim=1)
+ attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
+ attention_context = attention_context.squeeze(1)
+
+ return attention_context, attention_weights
+
+
+class _Prenet(nn.Module):
+ r"""Prenet Module. It is consists of ``len(output_size)`` linear layers.
+
+ Args:
+ in_dim (int): The size of each input sample.
+ out_sizes (list): The output dimension of each linear layer.
+ """
+
+ def __init__(self, in_dim: int, out_sizes: List[int]) -> None:
+ super().__init__()
+ in_sizes = [in_dim] + out_sizes[:-1]
+ self.layers = nn.ModuleList(
+ [_get_linear_layer(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, out_sizes)]
+ )
+
+ def forward(self, x: Tensor) -> Tensor:
+ r"""Pass the input through Prenet.
+
+ Args:
+ x (Tensor): The input sequence to Prenet with shape (n_batch, in_dim).
+
+ Return:
+ x (Tensor): Tensor with shape (n_batch, ``out_sizes[-1]``)
+ """
+
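+ # Note: dropout is applied with ``training=True`` on purpose. As in the original
+ # Tacotron 2 recipe, prenet dropout stays active at inference time as well, which
+ # introduces variation in the generated spectrograms.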
+ for linear in self.layers:
+ x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
+ return x
+
+
+class _Postnet(nn.Module):
+ r"""Postnet Module.
+
+ Args:
+ n_mels (int): Number of mel bins.
+ postnet_embedding_dim (int): Postnet embedding dimension.
+ postnet_kernel_size (int): Postnet kernel size.
+ postnet_n_convolution (int): Number of postnet convolutions.
+ """
+
+ def __init__(
+ self,
+ n_mels: int,
+ postnet_embedding_dim: int,
+ postnet_kernel_size: int,
+ postnet_n_convolution: int,
+ ):
+ super().__init__()
+ self.convolutions = nn.ModuleList()
+
+ for i in range(postnet_n_convolution):
+ in_channels = n_mels if i == 0 else postnet_embedding_dim
+ out_channels = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim
+ init_gain = "linear" if i == (postnet_n_convolution - 1) else "tanh"
+ num_features = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim
+ self.convolutions.append(
+ nn.Sequential(
+ _get_conv1d_layer(
+ in_channels,
+ out_channels,
+ kernel_size=postnet_kernel_size,
+ stride=1,
+ padding=int((postnet_kernel_size - 1) / 2),
+ dilation=1,
+ w_init_gain=init_gain,
+ ),
+ nn.BatchNorm1d(num_features),
+ )
+ )
+
+ self.n_convs = len(self.convolutions)
+
+ def forward(self, x: Tensor) -> Tensor:
+ r"""Pass the input through Postnet.
+
+ Args:
+ x (Tensor): The input sequence with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
+
+ Return:
+ x (Tensor): Tensor with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
+ """
+
+ for i, conv in enumerate(self.convolutions):
+ if i < self.n_convs - 1:
+ x = F.dropout(torch.tanh(conv(x)), 0.5, training=self.training)
+ else:
+ x = F.dropout(conv(x), 0.5, training=self.training)
+
+ return x
+
+
+class _Encoder(nn.Module):
+ r"""Encoder Module.
+
+ Args:
+ encoder_embedding_dim (int): Number of embedding dimensions in the encoder.
+ encoder_n_convolution (int): Number of convolution layers in the encoder.
+ encoder_kernel_size (int): The kernel size in the encoder.
+
+ Examples
+ >>> encoder = _Encoder(512, 3, 5)
+ >>> input = torch.rand(10, 512, 30)
+ >>> input_lengths = torch.full((10,), 30, dtype=torch.int64)
+ >>> output = encoder(input, input_lengths) # shape: (10, 30, 512)
+ """
+
+ def __init__(
+ self,
+ encoder_embedding_dim: int,
+ encoder_n_convolution: int,
+ encoder_kernel_size: int,
+ ) -> None:
+ super().__init__()
+
+ self.convolutions = nn.ModuleList()
+ for _ in range(encoder_n_convolution):
+ conv_layer = nn.Sequential(
+ _get_conv1d_layer(
+ encoder_embedding_dim,
+ encoder_embedding_dim,
+ kernel_size=encoder_kernel_size,
+ stride=1,
+ padding=int((encoder_kernel_size - 1) / 2),
+ dilation=1,
+ w_init_gain="relu",
+ ),
+ nn.BatchNorm1d(encoder_embedding_dim),
+ )
+ self.convolutions.append(conv_layer)
+
+ self.lstm = nn.LSTM(
+ encoder_embedding_dim,
+ int(encoder_embedding_dim / 2),
+ 1,
+ batch_first=True,
+ bidirectional=True,
+ )
+ self.lstm.flatten_parameters()
+
+ def forward(self, x: Tensor, input_lengths: Tensor) -> Tensor:
+ r"""Pass the input through the Encoder.
+
+ Args:
+ x (Tensor): The input sequences with shape (n_batch, encoder_embedding_dim, n_seq).
+ input_lengths (Tensor): The length of each input sequence with shape (n_batch, ).
+
+ Return:
+ x (Tensor): A tensor with shape (n_batch, n_seq, encoder_embedding_dim).
+ """
+
+ for conv in self.convolutions:
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+
+ x = x.transpose(1, 2)
+
+ input_lengths = input_lengths.cpu()
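+ # ``pack_padded_sequence`` is used with its default ``enforce_sorted=True``,
+ # so the batch is expected to be sorted by decreasing ``input_lengths``.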
+ x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True)
+
+ outputs, _ = self.lstm(x)
+ outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
+
+ return outputs
+
+
+class _Decoder(nn.Module):
+ r"""Decoder with Attention model.
+
+ Args:
+ n_mels (int): number of mel bins
+ n_frames_per_step (int): number of frames processed per step, only 1 is supported
+ encoder_embedding_dim (int): the number of embedding dimensions in the encoder.
+ decoder_rnn_dim (int): number of units in decoder LSTM
+ decoder_max_step (int): maximum number of output mel spectrograms
+ decoder_dropout (float): dropout probability for decoder LSTM
+ decoder_early_stopping (bool): stop decoding when all samples are finished
+ attention_rnn_dim (int): number of units in attention LSTM
+ attention_hidden_dim (int): dimension of attention hidden representation
+ attention_location_n_filter (int): number of filters for attention model
+ attention_location_kernel_size (int): kernel size for attention model
+ attention_dropout (float): dropout probability for attention LSTM
+ prenet_dim (int): number of ReLU units in prenet layers
+ gate_threshold (float): probability threshold for stop token
+ """
+
+ def __init__(
+ self,
+ n_mels: int,
+ n_frames_per_step: int,
+ encoder_embedding_dim: int,
+ decoder_rnn_dim: int,
+ decoder_max_step: int,
+ decoder_dropout: float,
+ decoder_early_stopping: bool,
+ attention_rnn_dim: int,
+ attention_hidden_dim: int,
+ attention_location_n_filter: int,
+ attention_location_kernel_size: int,
+ attention_dropout: float,
+ prenet_dim: int,
+ gate_threshold: float,
+ ) -> None:
+
+ super().__init__()
+ self.n_mels = n_mels
+ self.n_frames_per_step = n_frames_per_step
+ self.encoder_embedding_dim = encoder_embedding_dim
+ self.attention_rnn_dim = attention_rnn_dim
+ self.decoder_rnn_dim = decoder_rnn_dim
+ self.prenet_dim = prenet_dim
+ self.decoder_max_step = decoder_max_step
+ self.gate_threshold = gate_threshold
+ self.attention_dropout = attention_dropout
+ self.decoder_dropout = decoder_dropout
+ self.decoder_early_stopping = decoder_early_stopping
+
+ self.prenet = _Prenet(n_mels * n_frames_per_step, [prenet_dim, prenet_dim])
+
+ self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim)
+
+ self.attention_layer = _Attention(
+ attention_rnn_dim,
+ encoder_embedding_dim,
+ attention_hidden_dim,
+ attention_location_n_filter,
+ attention_location_kernel_size,
+ )
+
+ self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, True)
+
+ self.linear_projection = _get_linear_layer(decoder_rnn_dim + encoder_embedding_dim, n_mels * n_frames_per_step)
+
+ self.gate_layer = _get_linear_layer(
+ decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain="sigmoid"
+ )
+
+ def _get_initial_frame(self, memory: Tensor) -> Tensor:
+ r"""Gets all zeros frames to use as the first decoder input.
+
+ Args:
+ memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+
+ Returns:
+ decoder_input (Tensor): all-zeros frames with shape
+ (n_batch, ``n_mels * n_frames_per_step``).
+ """
+
+ n_batch = memory.size(0)
+ dtype = memory.dtype
+ device = memory.device
+ decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device)
+ return decoder_input
+
+ def _initialize_decoder_states(
+ self, memory: Tensor
+ ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
+ r"""Initializes attention rnn states, decoder rnn states, attention
+ weights, attention cumulative weights, attention context, stores memory
+ and stores processed memory.
+
+ Args:
+ memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+
+ Returns:
+ attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
+ attention_cell (Tensor): Cell state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
+ decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
+ decoder_cell (Tensor): Cell state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
+ attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
+ attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
+ attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
+ processed_memory (Tensor): Processed encoder outputs
+ with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
+ """
+ n_batch = memory.size(0)
+ max_time = memory.size(1)
+ dtype = memory.dtype
+ device = memory.device
+
+ attention_hidden = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device)
+ attention_cell = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device)
+
+ decoder_hidden = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device)
+ decoder_cell = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device)
+
+ attention_weights = torch.zeros(n_batch, max_time, dtype=dtype, device=device)
+ attention_weights_cum = torch.zeros(n_batch, max_time, dtype=dtype, device=device)
+ attention_context = torch.zeros(n_batch, self.encoder_embedding_dim, dtype=dtype, device=device)
+
+ processed_memory = self.attention_layer.memory_layer(memory)
+
+ return (
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ processed_memory,
+ )
+
+ def _parse_decoder_inputs(self, decoder_inputs: Tensor) -> Tensor:
+ r"""Prepares decoder inputs.
+
+ Args:
+ decoder_inputs (Tensor): Inputs used for teacher-forced training, i.e. mel-specs,
+ with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)
+
+ Returns:
+ inputs (Tensor): Processed decoder inputs with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``).
+ """
+ # (n_batch, n_mels, mel_specgram_lengths.max()) -> (n_batch, mel_specgram_lengths.max(), n_mels)
+ decoder_inputs = decoder_inputs.transpose(1, 2)
+ decoder_inputs = decoder_inputs.view(
+ decoder_inputs.size(0),
+ int(decoder_inputs.size(1) / self.n_frames_per_step),
+ -1,
+ )
+ # (n_batch, mel_specgram_lengths.max(), n_mels) -> (mel_specgram_lengths.max(), n_batch, n_mels)
+ decoder_inputs = decoder_inputs.transpose(0, 1)
+ return decoder_inputs
+
+ def _parse_decoder_outputs(
+ self, mel_specgram: Tensor, gate_outputs: Tensor, alignments: Tensor
+ ) -> Tuple[Tensor, Tensor, Tensor]:
+ r"""Prepares decoder outputs for output
+
+ Args:
+ mel_specgram (Tensor): mel spectrogram with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``)
+ gate_outputs (Tensor): predicted stop token with shape (max of ``mel_specgram_lengths``, n_batch)
+ alignments (Tensor): sequence of attention weights from the decoder
+ with shape (max of ``mel_specgram_lengths``, n_batch, max of ``text_lengths``)
+
+ Returns:
+ mel_specgram (Tensor): mel spectrogram with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)
+ gate_outputs (Tensor): predicted stop token with shape (n_batch, max of ``mel_specgram_lengths``)
+ alignments (Tensor): sequence of attention weights from the decoder
+ with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``)
+ """
+ # (mel_specgram_lengths.max(), n_batch, text_lengths.max())
+ # -> (n_batch, mel_specgram_lengths.max(), text_lengths.max())
+ alignments = alignments.transpose(0, 1).contiguous()
+ # (mel_specgram_lengths.max(), n_batch) -> (n_batch, mel_specgram_lengths.max())
+ gate_outputs = gate_outputs.transpose(0, 1).contiguous()
+ # (mel_specgram_lengths.max(), n_batch, n_mels) -> (n_batch, mel_specgram_lengths.max(), n_mels)
+ mel_specgram = mel_specgram.transpose(0, 1).contiguous()
+ # decouple frames per step
+ shape = (mel_specgram.shape[0], -1, self.n_mels)
+ mel_specgram = mel_specgram.view(*shape)
+ # (n_batch, mel_specgram_lengths.max(), n_mels) -> (n_batch, n_mels, T_out)
+ mel_specgram = mel_specgram.transpose(1, 2)
+
+ return mel_specgram, gate_outputs, alignments
+
+ def decode(
+ self,
+ decoder_input: Tensor,
+ attention_hidden: Tensor,
+ attention_cell: Tensor,
+ decoder_hidden: Tensor,
+ decoder_cell: Tensor,
+ attention_weights: Tensor,
+ attention_weights_cum: Tensor,
+ attention_context: Tensor,
+ memory: Tensor,
+ processed_memory: Tensor,
+ mask: Tensor,
+ ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
+ r"""Decoder step using stored states, attention and memory
+
+ Args:
+ decoder_input (Tensor): Output of the Prenet with shape (n_batch, ``prenet_dim``).
+ attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
+ attention_cell (Tensor): Cell state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
+ decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
+ decoder_cell (Tensor): Cell state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
+ attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
+ attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
+ attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
+ memory (Tensor): Encoder output with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+ processed_memory (Tensor): Processed Encoder outputs
+ with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
+ mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).
+
+ Returns:
+ decoder_output: Predicted mel spectrogram for the current frame with shape (n_batch, ``n_mels``).
+ gate_prediction (Tensor): Prediction of the stop token with shape (n_batch, ``1``).
+ attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
+ attention_cell (Tensor): Cell state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
+ decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
+ decoder_cell (Tensor): Cell state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
+ attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
+ attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
+ attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
+ """
+ cell_input = torch.cat((decoder_input, attention_context), -1)
+
+ attention_hidden, attention_cell = self.attention_rnn(cell_input, (attention_hidden, attention_cell))
+ attention_hidden = F.dropout(attention_hidden, self.attention_dropout, self.training)
+
+ attention_weights_cat = torch.cat((attention_weights.unsqueeze(1), attention_weights_cum.unsqueeze(1)), dim=1)
+ attention_context, attention_weights = self.attention_layer(
+ attention_hidden, memory, processed_memory, attention_weights_cat, mask
+ )
+
+ attention_weights_cum += attention_weights
+ decoder_input = torch.cat((attention_hidden, attention_context), -1)
+
+ decoder_hidden, decoder_cell = self.decoder_rnn(decoder_input, (decoder_hidden, decoder_cell))
+ decoder_hidden = F.dropout(decoder_hidden, self.decoder_dropout, self.training)
+
+ decoder_hidden_attention_context = torch.cat((decoder_hidden, attention_context), dim=1)
+ decoder_output = self.linear_projection(decoder_hidden_attention_context)
+
+ gate_prediction = self.gate_layer(decoder_hidden_attention_context)
+
+ return (
+ decoder_output,
+ gate_prediction,
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ )
+
+ def forward(
+ self, memory: Tensor, mel_specgram_truth: Tensor, memory_lengths: Tensor
+ ) -> Tuple[Tensor, Tensor, Tensor]:
+ r"""Decoder forward pass for training.
+
+ Args:
+ memory (Tensor): Encoder outputs
+ with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+ mel_specgram_truth (Tensor): Decoder ground-truth mel-specs for teacher forcing
+ with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
+ memory_lengths (Tensor): Encoder output lengths for attention masking
+ (the same as ``text_lengths``) with shape (n_batch, ).
+
+ Returns:
+ mel_specgram (Tensor): Predicted mel spectrogram
+ with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
+ gate_outputs (Tensor): Predicted stop token for each timestep
+ with shape (n_batch, max of ``mel_specgram_lengths``).
+ alignments (Tensor): Sequence of attention weights from the decoder
+ with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``).
+ """
+
+ decoder_input = self._get_initial_frame(memory).unsqueeze(0)
+ decoder_inputs = self._parse_decoder_inputs(mel_specgram_truth)
+ decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
+ decoder_inputs = self.prenet(decoder_inputs)
+
+ mask = _get_mask_from_lengths(memory_lengths)
+ (
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ processed_memory,
+ ) = self._initialize_decoder_states(memory)
+
+ mel_outputs, gate_outputs, alignments = [], [], []
+ while len(mel_outputs) < decoder_inputs.size(0) - 1:
+ decoder_input = decoder_inputs[len(mel_outputs)]
+ (
+ mel_output,
+ gate_output,
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ ) = self.decode(
+ decoder_input,
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ memory,
+ processed_memory,
+ mask,
+ )
+
+ mel_outputs += [mel_output.squeeze(1)]
+ gate_outputs += [gate_output.squeeze(1)]
+ alignments += [attention_weights]
+
+ mel_specgram, gate_outputs, alignments = self._parse_decoder_outputs(
+ torch.stack(mel_outputs), torch.stack(gate_outputs), torch.stack(alignments)
+ )
+
+ return mel_specgram, gate_outputs, alignments
+
+ def _get_go_frame(self, memory: Tensor) -> Tensor:
+ """Gets all zeros frames to use as the first decoder input
+
+ args:
+ memory (Tensor): Encoder outputs
+ with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+
+ returns:
+ decoder_input (Tensor): All zeros frames with shape(n_batch, ``n_mels`` * ``n_frame_per_step``).
+ """
+
+ n_batch = memory.size(0)
+ dtype = memory.dtype
+ device = memory.device
+ decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device)
+ return decoder_input
+
+ @torch.jit.export
+ def infer(self, memory: Tensor, memory_lengths: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+ """Decoder inference
+
+ Args:
+ memory (Tensor): Encoder outputs
+ with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
+ memory_lengths (Tensor): Encoder output lengths for attention masking
+ (the same as ``text_lengths``) with shape (n_batch, ).
+
+ Returns:
+ mel_specgram (Tensor): Predicted mel spectrogram
+ with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
+ mel_specgram_lengths (Tensor): The length of the predicted mel spectrogram with shape (n_batch, ).
+ gate_outputs (Tensor): Predicted stop token for each timestep
+ with shape (n_batch, max of ``mel_specgram_lengths``).
+ alignments (Tensor): Sequence of attention weights from the decoder
+ with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``).
+ """
+ batch_size, device = memory.size(0), memory.device
+
+ decoder_input = self._get_go_frame(memory)
+
+ mask = _get_mask_from_lengths(memory_lengths)
+ (
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ processed_memory,
+ ) = self._initialize_decoder_states(memory)
+
+ mel_specgram_lengths = torch.zeros([batch_size], dtype=torch.int32, device=device)
+ finished = torch.zeros([batch_size], dtype=torch.bool, device=device)
+ mel_specgrams: List[Tensor] = []
+ gate_outputs: List[Tensor] = []
+ alignments: List[Tensor] = []
+ for _ in range(self.decoder_max_step):
+ decoder_input = self.prenet(decoder_input)
+ (
+ mel_specgram,
+ gate_output,
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ ) = self.decode(
+ decoder_input,
+ attention_hidden,
+ attention_cell,
+ decoder_hidden,
+ decoder_cell,
+ attention_weights,
+ attention_weights_cum,
+ attention_context,
+ memory,
+ processed_memory,
+ mask,
+ )
+
+ mel_specgrams.append(mel_specgram.unsqueeze(0))
+ gate_outputs.append(gate_output.transpose(0, 1))
+ alignments.append(attention_weights)
+ mel_specgram_lengths[~finished] += 1
+
+ finished |= torch.sigmoid(gate_output.squeeze(1)) > self.gate_threshold
+ if self.decoder_early_stopping and torch.all(finished):
+ break
+
+ decoder_input = mel_specgram
+
+ if len(mel_specgrams) == self.decoder_max_step:
+ warnings.warn(
+ "Reached max decoder steps. The generated spectrogram might not cover " "the whole transcript."
+ )
+
+ mel_specgrams = torch.cat(mel_specgrams, dim=0)
+ gate_outputs = torch.cat(gate_outputs, dim=0)
+ alignments = torch.cat(alignments, dim=0)
+
+ mel_specgrams, gate_outputs, alignments = self._parse_decoder_outputs(mel_specgrams, gate_outputs, alignments)
+
+ return mel_specgrams, mel_specgram_lengths, gate_outputs, alignments
+
+
+class Tacotron2(nn.Module):
+ r"""Tacotron2 model from *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
+ :cite:`shen2018natural` based on the implementation from
+ `Nvidia Deep Learning Examples <https://github.com/NVIDIA/DeepLearningExamples>`_.
+
+ See Also:
+ * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.
+
+ Args:
+ mask_padding (bool, optional): Use mask padding (Default: ``False``).
+ n_mels (int, optional): Number of mel bins (Default: ``80``).
+ n_symbol (int, optional): Number of symbols for the input text (Default: ``148``).
+ n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``).
+ symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``).
+ encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``).
+ encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``).
+ encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``).
+ decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``).
+ decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``).
+ decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``).
+ decoder_early_stopping (bool, optional): Stop decoding when all samples are finished (Default: ``True``).
+ attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``).
+ attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``).
+ attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``).
+ attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``).
+ attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``).
+ prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``).
+ postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``).
+ postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``).
+ postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``).
+ gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``).
+ """
+
+ def __init__(
+ self,
+ mask_padding: bool = False,
+ n_mels: int = 80,
+ n_symbol: int = 148,
+ n_frames_per_step: int = 1,
+ symbol_embedding_dim: int = 512,
+ encoder_embedding_dim: int = 512,
+ encoder_n_convolution: int = 3,
+ encoder_kernel_size: int = 5,
+ decoder_rnn_dim: int = 1024,
+ decoder_max_step: int = 2000,
+ decoder_dropout: float = 0.1,
+ decoder_early_stopping: bool = True,
+ attention_rnn_dim: int = 1024,
+ attention_hidden_dim: int = 128,
+ attention_location_n_filter: int = 32,
+ attention_location_kernel_size: int = 31,
+ attention_dropout: float = 0.1,
+ prenet_dim: int = 256,
+ postnet_n_convolution: int = 5,
+ postnet_kernel_size: int = 5,
+ postnet_embedding_dim: int = 512,
+ gate_threshold: float = 0.5,
+ ) -> None:
+ super().__init__()
+
+ self.mask_padding = mask_padding
+ self.n_mels = n_mels
+ self.n_frames_per_step = n_frames_per_step
+ self.embedding = nn.Embedding(n_symbol, symbol_embedding_dim)
+ torch.nn.init.xavier_uniform_(self.embedding.weight)
+ self.encoder = _Encoder(encoder_embedding_dim, encoder_n_convolution, encoder_kernel_size)
+ self.decoder = _Decoder(
+ n_mels,
+ n_frames_per_step,
+ encoder_embedding_dim,
+ decoder_rnn_dim,
+ decoder_max_step,
+ decoder_dropout,
+ decoder_early_stopping,
+ attention_rnn_dim,
+ attention_hidden_dim,
+ attention_location_n_filter,
+ attention_location_kernel_size,
+ attention_dropout,
+ prenet_dim,
+ gate_threshold,
+ )
+ self.postnet = _Postnet(n_mels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolution)
+
+ def forward(
+ self,
+ tokens: Tensor,
+ token_lengths: Tensor,
+ mel_specgram: Tensor,
+ mel_specgram_lengths: Tensor,
+ ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+ r"""Pass the input through the Tacotron2 model. This is in teacher
+ forcing mode, which is generally used for training.
+
+ The input ``tokens`` should be padded with zeros to length max of ``token_lengths``.
+ The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.
+
+ Args:
+ tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`.
+ token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
+ mel_specgram (Tensor): The target mel spectrogram
+ with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+ mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.
+
+ Returns:
+ [Tensor, Tensor, Tensor, Tensor]:
+ Tensor
+ Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+ Tensor
+ Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+ Tensor
+ The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
+ Tensor
+ Sequence of attention weights from the decoder with
+ shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
+ """
+
+ embedded_inputs = self.embedding(tokens).transpose(1, 2)
+
+ encoder_outputs = self.encoder(embedded_inputs, token_lengths)
+ mel_specgram, gate_outputs, alignments = self.decoder(
+ encoder_outputs, mel_specgram, memory_lengths=token_lengths
+ )
+
+ mel_specgram_postnet = self.postnet(mel_specgram)
+ mel_specgram_postnet = mel_specgram + mel_specgram_postnet
+
+ if self.mask_padding:
+ mask = _get_mask_from_lengths(mel_specgram_lengths)
+ mask = mask.expand(self.n_mels, mask.size(0), mask.size(1))
+ mask = mask.permute(1, 0, 2)
+
+ mel_specgram.masked_fill_(mask, 0.0)
+ mel_specgram_postnet.masked_fill_(mask, 0.0)
+ gate_outputs.masked_fill_(mask[:, 0, :], 1e3)
+
+ return mel_specgram, mel_specgram_postnet, gate_outputs, alignments
+
+ @torch.jit.export
+ def infer(self, tokens: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Tensor, Tensor]:
+ r"""Using Tacotron2 for inference. The input is a batch of encoded
+ sentences (``tokens``) and its corresponding lengths (``lengths``). The
+ output is the generated mel spectrograms, its corresponding lengths, and
+ the attention weights from the decoder.
+
+ The input `tokens` should be padded with zeros to length max of ``lengths``.
+
+ Args:
+ tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`.
+ lengths (Tensor or None, optional):
+ The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
+ If ``None``, it is assumed that all the tokens are valid. Default: ``None``
+
+ Returns:
+ (Tensor, Tensor, Tensor):
+ Tensor
+ The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+ Tensor
+ The length of the predicted mel spectrogram with shape `(n_batch, )`.
+ Tensor
+ Sequence of attention weights from the decoder with shape
+ `(n_batch, max of mel_specgram_lengths, max of lengths)`.
+ """
+ n_batch, max_length = tokens.shape
+ if lengths is None:
+ lengths = torch.tensor([max_length]).expand(n_batch).to(tokens.device, tokens.dtype)
+
+ assert lengths is not None # For TorchScript compiler
+ embedded_inputs = self.embedding(tokens).transpose(1, 2)
+ encoder_outputs = self.encoder(embedded_inputs, lengths)
+ mel_specgram, mel_specgram_lengths, _, alignments = self.decoder.infer(encoder_outputs, lengths)
+
+ mel_outputs_postnet = self.postnet(mel_specgram)
+ mel_outputs_postnet = mel_specgram + mel_outputs_postnet
+
+ alignments = alignments.unfold(1, n_batch, n_batch).transpose(0, 2)
+
+ return mel_outputs_postnet, mel_specgram_lengths, alignments
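+
+
+ # A minimal inference sketch (illustrative only, not part of the module): it assumes the
+ # ``Tacotron2`` class defined above is constructed with its default arguments and that the
+ # token IDs lie inside the model's symbol range.
+ def _tacotron2_infer_sketch():
+     model = Tacotron2().eval()
+     tokens = torch.randint(1, 100, (2, 50))  # (n_batch, max token length); IDs assumed < n_symbol
+     tokens[1, 42:] = 0  # zero-pad the shorter sequence
+     lengths = torch.tensor([50, 42])  # valid length of each sample, in decreasing order
+     with torch.no_grad():
+         mel, mel_lengths, alignments = model.infer(tokens, lengths)
+     # mel: (n_batch, n_mels, max mel length), mel_lengths: (n_batch,),
+     # alignments: (n_batch, max mel length, max token length)
+     return mel, mel_lengths, alignments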
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2letter.py b/MLPY/Lib/site-packages/torchaudio/models/wav2letter.py
new file mode 100644
index 0000000000000000000000000000000000000000..defe7902fbe3c20aeb9fdaa5b5f32840691f6c66
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2letter.py
@@ -0,0 +1,72 @@
+from torch import nn, Tensor
+
+__all__ = [
+ "Wav2Letter",
+]
+
+
+class Wav2Letter(nn.Module):
+ r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+ Recognition System* :cite:`collobert2016wav2letter`.
+
+ See Also:
+ * `Training example `__
+
+ Args:
+ num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
+ input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
+ or ``mfcc`` (Default: ``waveform``).
+ num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
+ """
+
+ def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:
+ super().__init__()
+
+ acoustic_num_features = 250 if input_type == "waveform" else num_features
+ acoustic_model = nn.Sequential(
+ nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
+ nn.ReLU(inplace=True),
+ )
+
+ if input_type == "waveform":
+ waveform_model = nn.Sequential(
+ nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
+ nn.ReLU(inplace=True),
+ )
+ self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
+
+ if input_type in ["power_spectrum", "mfcc"]:
+ self.acoustic_model = acoustic_model
+
+ def forward(self, x: Tensor) -> Tensor:
+ r"""
+ Args:
+ x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).
+
+ Returns:
+ Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length).
+ """
+
+ x = self.acoustic_model(x)
+ x = nn.functional.log_softmax(x, dim=1)
+ return x
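+
+
+ # A minimal usage sketch (illustrative only, not part of the module): with
+ # ``input_type="waveform"`` the first convolution has stride 160, so roughly one output
+ # frame is produced per 160 input samples (halved again by the stride-2 convolution that
+ # follows).
+ def _wav2letter_usage_sketch():
+     import torch
+
+     model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
+     waveform = torch.randn(1, 1, 16000)  # (batch_size, num_features, input_length)
+     log_probs = model(waveform)  # (batch_size, num_classes, output_length), log-probabilities over classes
+     return log_probs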
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/__init__.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd9720de106dd39f24f51ad267ec4b776cc3ab5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/__init__.py
@@ -0,0 +1,45 @@
+from . import utils
+from .model import (
+ hubert_base,
+ hubert_large,
+ hubert_pretrain_base,
+ hubert_pretrain_large,
+ hubert_pretrain_model,
+ hubert_pretrain_xlarge,
+ hubert_xlarge,
+ HuBERTPretrainModel,
+ wav2vec2_base,
+ wav2vec2_large,
+ wav2vec2_large_lv60k,
+ wav2vec2_model,
+ wav2vec2_xlsr_1b,
+ wav2vec2_xlsr_2b,
+ wav2vec2_xlsr_300m,
+ Wav2Vec2Model,
+ wavlm_base,
+ wavlm_large,
+ wavlm_model,
+)
+
+__all__ = [
+ "Wav2Vec2Model",
+ "HuBERTPretrainModel",
+ "wavlm_model",
+ "wavlm_base",
+ "wavlm_large",
+ "wav2vec2_model",
+ "wav2vec2_base",
+ "wav2vec2_large",
+ "wav2vec2_large_lv60k",
+ "hubert_base",
+ "hubert_large",
+ "hubert_xlarge",
+ "hubert_pretrain_model",
+ "hubert_pretrain_base",
+ "hubert_pretrain_large",
+ "hubert_pretrain_xlarge",
+ "utils",
+ "wav2vec2_xlsr_300m",
+ "wav2vec2_xlsr_1b",
+ "wav2vec2_xlsr_2b",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/components.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/components.py
new file mode 100644
index 0000000000000000000000000000000000000000..8489c5e1dd996d5e7f1a707b855ed87a6a7da99e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/components.py
@@ -0,0 +1,1167 @@
+import logging
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn, Tensor
+from torch.nn import Module, Parameter
+
+from .wavlm_attention import WavLMSelfAttention
+
+_LG = logging.getLogger(__name__)
+
+
+def _init_transformer_params(module):
+ """
+ Initialize the weights of Transformer module in Wav2Vec2/HuBERT.
+
+ If the module is ``nn.Linear``, normalize the weight with mean 0 and standard deviation 0.02.
+ If ``bias`` is set to ``True`` in the module, set ``bias`` to 0.
+
+ If the module is ``nn.Embedding``, normalize the weight with mean 0 and standard deviation 0.02.
+ If ``padding_idx`` is not None, set the weight of padding to 0.
+
+ Note:
+ This method corresponds to
+ `init_bert_params
+ `__
+ in the original ``fairseq`` implementation.
+ """
+
+ def normal_(data):
+ data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
+
+ if isinstance(module, nn.Linear):
+ normal_(module.weight.data)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ if isinstance(module, nn.Embedding):
+ normal_(module.weight.data)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+class LayerNorm(nn.LayerNorm):
+ """Layer norm with transpose"""
+
+ def forward(self, input: Tensor) -> Tensor:
+ x = input.transpose(-2, -1)
+ x = nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+ x = x.transpose(-2, -1)
+ return x
+
+
+class ConvLayerBlock(Module):
+ """Convolution unit of FeatureExtractor"""
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int,
+ stride: int,
+ bias: bool,
+ layer_norm: Optional[Module],
+ ):
+ super().__init__()
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.layer_norm = layer_norm
+ self.conv = nn.Conv1d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ bias=bias,
+ )
+
+ def forward(
+ self,
+ x: Tensor,
+ length: Optional[Tensor],
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ x (Tensor): Shape: ``[batch, in_channels, in_frame]``.
+ length (Tensor or None, optional): Shape ``[batch, ]``.
+ Returns:
+ Tensor: Shape ``[batch, out_channels, out_frames]``.
+ Optional[Tensor]: Shape ``[batch, ]``.
+ """
+ x = self.conv(x)
+ if self.layer_norm is not None:
+ x = self.layer_norm(x)
+ x = nn.functional.gelu(x)
+
+ if length is not None:
+ length = torch.div(length - self.kernel_size, self.stride, rounding_mode="floor") + 1
+ # When input length is 0, the resulting length can be negative. So fix it here.
+ length = torch.max(torch.zeros_like(length), length)
+ return x, length
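+
+
+ # Worked example of the length update above (illustrative sketch, not part of the module):
+ # the first wav2vec2 extractor layer uses kernel_size=10 and stride=5, so a 16000-sample
+ # waveform yields (16000 - 10) // 5 + 1 = 3199 valid output frames.
+ def _conv_layer_block_length_sketch():
+     block = ConvLayerBlock(in_channels=1, out_channels=512, kernel_size=10, stride=5, bias=False, layer_norm=None)
+     x = torch.randn(1, 1, 16000)
+     out, out_length = block(x, torch.tensor([16000]))
+     # out: (1, 512, 3199), out_length: tensor([3199])
+     return out, out_length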
+
+
+class FeatureExtractor(Module):
+ """Extract features from audio
+
+ Args:
+ conv_layers (nn.ModuleList):
+ convolution layers
+ """
+
+ def __init__(
+ self,
+ conv_layers: nn.ModuleList,
+ ):
+ super().__init__()
+ self.conv_layers = conv_layers
+
+ def forward(
+ self,
+ x: Tensor,
+ length: Optional[Tensor],
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ x (Tensor):
+ Input Tensor representing a batch of audio,
+ shape: ``[batch, time]``.
+ length (Tensor or None, optional):
+ Valid length of each input sample. shape: ``[batch, ]``.
+
+ Returns:
+ Tensor:
+ The resulting feature, shape: ``[batch, frame, feature]``
+ Optional[Tensor]:
+ Valid length of each output sample. shape: ``[batch, ]``.
+ """
+ if x.ndim != 2:
+ raise ValueError(f"Expected the input Tensor to be 2D (batch, time). Found: {list(x.shape)}")
+
+ x = x.unsqueeze(1) # (batch, channel==1, frame)
+ for layer in self.conv_layers:
+ x, length = layer(x, length) # (batch, feature, frame)
+ x = x.transpose(1, 2) # (batch, frame, feature)
+ return x, length
+
+
+class FeatureProjection(Module):
+ """Layer that connects FeatureExtractor and Encoder
+
+ Projects features to encoder dimension.
+
+ Args:
+ in_features (int): Input feature dim.
+ out_features (int): Output feature dim.
+ dropout (float): Dropout probability.
+ """
+
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ dropout: float,
+ ):
+ super().__init__()
+ self.layer_norm = nn.LayerNorm(in_features)
+ self.projection = nn.Linear(
+ in_features,
+ out_features,
+ )
+ self.dropout = nn.Dropout(dropout)
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor):
+ Feature Tensor. shape: ``[batch, frame, in_feature]``
+ Returns:
+ Tensor: Projected features. ``[batch, frame, out_feature]``.
+ """
+ x = self.layer_norm(x)
+ x = self.projection(x)
+ x = self.dropout(x)
+ return x
+
+
+class ConvolutionalPositionalEmbedding(Module):
+ """Positional embedding which is placed at the beginning of Transformer.
+
+ Args:
+ embed_dim (int): Feature dimension of the input Tensor.
+ kernel_size (int): The number of frames to be used.
+ groups (int): The number of groups in feature dimensions.
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ kernel_size: int,
+ groups: int,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.kernel_size = kernel_size
+ self.conv = nn.Conv1d(
+ in_channels=embed_dim,
+ out_channels=embed_dim,
+ kernel_size=kernel_size,
+ padding=kernel_size // 2,
+ groups=groups,
+ )
+
+ self.conv = nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
+ self.num_remove: int = 1 if kernel_size % 2 == 0 else 0
+
+ def __prepare_scriptable__(self):
+ if self.conv.__class__.__name__ == "ParametrizedConv1d":
+ _LG.warning("Removing weight_norm from %s", self.__class__.__name__)
+ torch.nn.utils.parametrize.remove_parametrizations(self.conv, "weight")
+ return self
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): shape ``[batch, frame, feature]``.
+
+ Returns:
+ Tensor: The resulting feature. Shape ``[batch, frame, feature]``.
+ """
+ x = x.transpose(-2, -1)
+ x = self.conv(x)
+ if self.num_remove > 0:
+ x = x[..., : -self.num_remove]
+ x = torch.nn.functional.gelu(x)
+ x = x.transpose(-2, -1)
+ return x
+
+
+class SelfAttention(Module):
+ """Multihead Self Attention module
+
+ Args:
+ embed_dim (int): Total dimension of the model.
+ num_heads (int): The number of heads.
+ dropout (float, optional):
+ Dropout probability on attn_output_weights. Default: ``0.0``
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ ):
+ super().__init__()
+ head_dim = embed_dim // num_heads
+ if head_dim * num_heads != embed_dim:
+ raise ValueError(f"`embed_dim ({embed_dim})` is not divisible by `num_heads ({num_heads})`")
+
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = head_dim
+
+ self.scaling = self.head_dim**-0.5
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+
+ def forward(
+ self,
+ x: Tensor,
+ attention_mask: Optional[Tensor] = None,
+ position_bias: Optional[Tensor] = None,
+ key_padding_mask: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ x (Tensor): shape: ``[batch_size, sequence_length, embed_dim]``.
+ attention_mask (Tensor or ``None``, optional):
+ shape: ``[batch_size, 1, sequence_length, sequence_length]``
+ position_bias: Not used. Only for the compatibility with :py:class:`WavLMSelfAttention`.
+ key_padding_mask (Tensor or ``None``): Not used. Only for the compatibility with
+ :py:class:`WavLMSelfAttention`.
+ Returns:
+ (Tensor, ``None``): The resulting attention output and ``None`` (necessary for compatibility
+ with :py:class:`WavLMSelfAttention`).
+ Attention output shape: ``[batch, sequence_length, embed_dim]``.
+ """
+ if x.ndim != 3 or x.shape[2] != self.embed_dim:
+ raise ValueError(
+ f"The expected input shape is (batch, sequence, embed_dim=={self.embed_dim}). " f"Found {x.shape}."
+ )
+ batch_size, length, embed_dim = x.size()
+ if attention_mask is not None:
+ shape_ = (batch_size, 1, length, length)
+ if attention_mask.size() != shape_:
+ raise ValueError(f"The expected attention mask shape is {shape_}. " f"Found {attention_mask.size()}.")
+
+ shape = (batch_size, length, self.num_heads, self.head_dim)
+ q = self.q_proj(x).view(*shape).transpose(2, 1) # B, nH, L, Hd
+ k = self.k_proj(x).view(*shape).transpose(2, 1) # B, nH, L, Hd
+ v = self.v_proj(x).view(*shape).transpose(2, 1) # B, nH, L, Hd
+ dropout = self.dropout if self.training else 0.0
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ q, k, v, attn_mask=attention_mask, dropout_p=dropout, is_causal=False
+ )
+ attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
+ output = self.out_proj(attn_output)
+ return output, None # Necessary for compatibility with WavLMSelfAttention
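+
+
+ # A minimal shape-checking sketch (illustrative only, not part of the module):
+ def _self_attention_sketch():
+     attention = SelfAttention(embed_dim=768, num_heads=12, dropout=0.1).eval()
+     x = torch.randn(2, 50, 768)  # (batch_size, sequence_length, embed_dim)
+     output, position_bias = attention(x)  # no mask: full self-attention over the sequence
+     # output: (2, 50, 768); position_bias is None (kept only for WavLM compatibility)
+     return output, position_bias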
+
+
+class FeedForward(Module):
+ """Layer that follows attention layer in encoder layer."""
+
+ def __init__(
+ self,
+ io_features: int,
+ intermediate_features: int,
+ intermediate_dropout: float,
+ output_dropout: float,
+ ):
+ super().__init__()
+ self.intermediate_dense = nn.Linear(io_features, intermediate_features)
+ self.intermediate_dropout = nn.Dropout(intermediate_dropout)
+ self.output_dense = nn.Linear(intermediate_features, io_features)
+ self.output_dropout = nn.Dropout(output_dropout)
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): shape: `(batch, sequence_length, io_features)`
+ Returns:
+ x (Tensor): shape: `(batch, sequence_length, io_features)`
+ """
+ x = self.intermediate_dense(x)
+ x = torch.nn.functional.gelu(x)
+ x = self.intermediate_dropout(x)
+
+ x = self.output_dense(x)
+ x = self.output_dropout(x)
+ return x
+
+
+class EncoderLayer(Module):
+ """A layer unit in encoder. Combines multihead self attention and feed forward."""
+
+ def __init__(
+ self,
+ attention: Module,
+ dropout: float,
+ layer_norm_first: bool,
+ feed_forward: Module,
+ ):
+ super().__init__()
+ self.attention = attention
+ self.dropout = nn.Dropout(dropout)
+ self.layer_norm = nn.LayerNorm(attention.embed_dim)
+ self.layer_norm_first = layer_norm_first
+ self.feed_forward = feed_forward
+ self.final_layer_norm = nn.LayerNorm(attention.embed_dim)
+
+ def forward(
+ self,
+ x: Tensor,
+ attention_mask: Optional[Tensor] = None,
+ position_bias: Optional[Tensor] = None,
+ key_padding_mask: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ x (Tensor): Input of shape ``(batch, sequence_length, embed_dim)``.
+ attention_mask (Tensor or ``None``, optional): attention mask
+ of shape ``(batch, 1, sequence_length, sequence_length)``. (Default: ``None``)
+ position_bias (Tensor or ``None``, optional): position bias of shape
+ ``(batch_size * num_heads, src_len, src_len)``.
+ Only necessary for WavLM model, ``None`` otherwise. (Default: ``None``)
+ key_padding_mask (Tensor or ``None``, optional): key padding mask of shape ``(batch_size, src_len)``.
+ Only used for WavLM model, ignored otherwise. (Default: ``None``)
+ Returns:
+ (x, position_bias): Shapes are the same as in the input. Position bias is only relevant for the WavLM model,
+ ``None`` otherwise.
+ """
+ residual = x
+
+ if self.layer_norm_first:
+ x = self.layer_norm(x)
+
+ x, position_bias = self.attention(
+ x, attention_mask=attention_mask, position_bias=position_bias, key_padding_mask=key_padding_mask
+ )
+
+ x = self.dropout(x)
+ x = residual + x
+
+ if self.layer_norm_first:
+ x = x + self.feed_forward(self.final_layer_norm(x))
+ else:
+ x = self.layer_norm(x)
+ x = self.final_layer_norm(x + self.feed_forward(x))
+ return x, position_bias
+
+
+class Transformer(Module):
+ def __init__(
+ self,
+ pos_conv_embed: Module,
+ dropout: float,
+ layers: Module,
+ layer_norm_first: bool,
+ layer_drop: float,
+ ):
+ super().__init__()
+ self.pos_conv_embed = pos_conv_embed
+ self.layer_norm = nn.LayerNorm(pos_conv_embed.embed_dim)
+ self.layer_norm_first = layer_norm_first
+ self.layer_drop = layer_drop
+ self.dropout = nn.Dropout(dropout)
+ self.layers = layers
+
+ def _preprocess(self, x: Tensor):
+ x = x + self.pos_conv_embed(x)
+
+ if self.layer_norm_first:
+ x = self.layer_norm(x)
+
+ x = self.dropout(x)
+ return x
+
+ def forward(
+ self,
+ x: Tensor,
+ attention_mask: Optional[Tensor] = None,
+ position_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ x = self._preprocess(x)
+ for layer in self.layers:
+ if not (self.training and torch.rand(1).item() <= self.layer_drop):
+ x, position_bias = layer(x, attention_mask, position_bias=position_bias)
+
+ if not self.layer_norm_first:
+ x = self.layer_norm(x)
+ return x
+
+ def get_intermediate_outputs(
+ self,
+ x: Tensor,
+ attention_mask: Optional[Tensor] = None,
+ num_layers: Optional[int] = None,
+ ) -> List[Tensor]:
+ if num_layers is not None:
+ if not 0 < num_layers <= len(self.layers):
+ raise ValueError(f"`num_layers` must be between [1, {len(self.layers)}]")
+
+ ret: List[Tensor] = []
+ position_bias = None
+ x = self._preprocess(x)
+ for layer in self.layers:
+ x, position_bias = layer(x, attention_mask, position_bias=position_bias)
+ ret.append(x)
+ if num_layers is not None and len(ret) >= num_layers:
+ return ret
+ return ret
+
+
+class Encoder(Module):
+ def __init__(
+ self,
+ feature_projection: Module,
+ transformer: Module,
+ ):
+ super().__init__()
+ self.feature_projection = feature_projection
+ self.transformer = transformer
+
+ def _preprocess(
+ self,
+ features: Tensor,
+ lengths: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ x = self.feature_projection(features)
+
+ mask: Optional[Tensor] = None
+ if lengths is not None:
+ batch_size, max_len, _ = x.shape
+ # create mask for padded elements and zero-out them
+ mask = torch.arange(max_len, device=lengths.device).expand(batch_size, max_len) >= lengths[:, None]
+ x[mask] = 0.0
+ # extend the mask to attention shape and set weight
+ mask = -10000.0 * mask[:, None, None, :].to(dtype=features.dtype)
+ mask = mask.expand(batch_size, 1, max_len, max_len)
+ return x, mask
+
+ def forward(
+ self,
+ features: Tensor,
+ lengths: Optional[Tensor] = None,
+ ) -> Tensor:
+ x, mask = self._preprocess(features, lengths)
+ x = self.transformer(x, attention_mask=mask)
+ return x
+
+ def extract_features(
+ self,
+ features: Tensor,
+ lengths: Optional[Tensor] = None,
+ num_layers: Optional[int] = None,
+ ) -> List[Tensor]:
+ x, masks = self._preprocess(features, lengths)
+ return self.transformer.get_intermediate_outputs(x, attention_mask=masks, num_layers=num_layers)
+
+
+################################################################################
+def _get_feature_extractor(
+ norm_mode: str,
+ shapes: List[Tuple[int, int, int]],
+ bias: bool,
+) -> FeatureExtractor:
+ """
+ Args:
+ norm_mode (str):
+ Either "group_norm" or "layer_norm".
+ If "group_norm", then a single normalization is applied
+ in the first convolution block. Otherwise, all the convolution
+ blocks will have layer normalization.
+ This option corresponds to "extractor_mode" from fairseq.
+ Expected values are "group_norm" for Base arch, and
+ "layer_norm" for Large arch.
+ shapes (list of tuple of int):
+ Configuration of convolution layers. List of convolution configuration,
+ i.e. ``[(output_channel, kernel_size, stride), ...]``
+ This option corresponds to "conv_feature_layers" from fairseq.
+ Expected values are
+ ``[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2``
+ for all the architectures.
+ bias (bool):
+ Whether to include bias term to each convolution operation.
+ This option corresponds to "conv_bias" from fairseq.
+ Expected values are False for Base arch, and True for Large arch.
+
+ See Also:
+ * Original implementation
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L666-L733
+ * "extractor_mode"
+ - Def and base:
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L38-L45
+ - Large:
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L52
+ * "conv_feature_layers"
+ - Def, base and large:
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L94-L100
+ * "conv_bias"
+ - Def and base:
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L101-L103
+ - Large:
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L61
+ """
+ if norm_mode not in ["group_norm", "layer_norm"]:
+ raise ValueError("Invalid norm mode")
+ blocks = []
+ in_channels = 1
+ for i, (out_channels, kernel_size, stride) in enumerate(shapes):
+ normalization = None
+ if norm_mode == "group_norm" and i == 0:
+ normalization = nn.GroupNorm(
+ num_groups=out_channels,
+ num_channels=out_channels,
+ affine=True,
+ )
+ elif norm_mode == "layer_norm":
+ normalization = LayerNorm(
+ normalized_shape=out_channels,
+ elementwise_affine=True,
+ )
+ blocks.append(
+ ConvLayerBlock(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ bias=bias,
+ layer_norm=normalization,
+ )
+ )
+ in_channels = out_channels
+ return FeatureExtractor(nn.ModuleList(blocks))
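+
+
+ # A minimal construction sketch (illustrative only, not part of the module), using the
+ # default convolution configuration documented above for the Base/Large architectures:
+ def _feature_extractor_sketch():
+     shapes = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
+     extractor = _get_feature_extractor(norm_mode="group_norm", shapes=shapes, bias=False)
+     waveform = torch.randn(2, 16000)  # (batch, time)
+     features, lengths = extractor(waveform, torch.tensor([16000, 8000]))
+     # features: (batch, frame, 512); lengths: valid number of frames per sample
+     return features, lengths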
+
+
+def _get_encoder(
+ in_features: int,
+ embed_dim: int,
+ dropout_input: float,
+ pos_conv_kernel: int,
+ pos_conv_groups: int,
+ num_layers: int,
+ num_heads: int,
+ attention_dropout: float,
+ ff_interm_features: int,
+ ff_interm_dropout: float,
+ dropout: float,
+ layer_norm_first: bool,
+ layer_drop: float,
+) -> Encoder:
+ """
+ Args:
+ in_features (int): The number of input features.
+ embed_dim (int):
+ The dimension of embedding.
+ This option corresponds to "encoder_embed_dim" from fairseq.
+ Expected values are 768 for Base arch, and 1024 for Large arch.
+ dropout_input (float):
+ The dropout probability applied after the input feature is projected
+ to ``embed_dim``.
+ This option corresponds to "dropout_input" from fairseq.
+ Expected values are 0.1 for both Base and Large arch.
+ pos_conv_kernel (int):
+ The kernel size of convolutional positional embeddings.
+ This option corresponds to "conv_pos" from fairseq.
+ Expected values are 128 for both Base and Large arch.
+ pos_conv_groups (int):
+ The number of groups of convolutional positional embeddings.
+ This option corresponds to "conv_pos_groups" from fairseq.
+ Expected values are 16 for both Base and Large arch.
+ num_layers (int):
+ The number of self attention layers in transformer block.
+ This option corresponds to "encoder_layers" from fairseq.
+ Expected values are 12 for Base and 24 for Large arch.
+ num_heads (int):
+ The number of heads in self attention layers.
+ This option corresponds to "encoder_attention_heads" from fairseq.
+ Expected values are 12 for Base and 16 for Large arch.
+ attention_dropout (float):
+ The dropout probability applied after softmax in self-attention layer.
+ This option corresponds to "attention_dropout" from fairseq.
+ Expected values are 0.1 for Base and 0.0 for Large arch.
+ ff_interm_features (int):
+ The dimension of hidden features in feed forward layer.
+ This option corresponds to "encoder_ffn_embed_dim" from fairseq.
+ Expected values are 3072 for Base and 4096 for Large arch.
+ ff_interm_dropout (float):
+ The dropout probability applied in feedforward layer.
+ This option corresponds to "activation_dropout" from fairseq.
+ Expected values are 0.1 for both Base and Large arch.
+ dropout (float):
+ The dropout probability applied at the end of feed forward layer.
+ This option corresponds to "dropout" from fairseq.
+ Expected values are 0.1 for Base and 0.0 for Large arch.
+ layer_norm_first (bool):
+ Control the order of layer norm in transformer layer and each encoder layer.
+ If True, in transformer layer, layer norm is applied before features are fed
+ to encoder layers. In encoder layer, two layer norms are applied before and after
+ self attention.
+ If False, in transformer layer, layer norm is applied after features are fed
+ to encoder layers. In encoder layer, two layer norms are applied after self
+ attention, before and after feed forward.
+ This option corresponds to "layer_norm_first" from fairseq.
+ Expected values are False for Base and True for Large arch.
+ layer_drop (float):
+ Probability to drop each encoder layer during training.
+ This option corresponds to "layerdrop" from fairseq.
+ Expected values are 0.1 for both Base and Large arch.
+
+ See Also:
+ * "encoder_embed_dim"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L49-L51
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L64
+ * "dropout_input"
+ - Def, base and large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L75-L78
+ * "conv_pos"
+ - Def, base and large
+ NOTE: The description is wrong.
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L204-L207
+ - Usage
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L756
+ * "conv_pos_groups"
+ - Def, base and large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L208-L211
+ * "encoder_layers"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L46-L48
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L63
+ * "encoder_attention_heads"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L55-L57
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L66
+ * "attention_dropout"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L66-L68
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L60
+ * "encoder_ffn_embed_dim"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L52-L54
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L65
+ * "activation_dropout"
+ - Def
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L69-L71
+ - Base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/base_960h.yaml#L55
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/vox_960h.yaml#L55
+ * "dropout"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L63-L65
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L59
+ * "layer_norm_first"
+ - Def and base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L91-L93
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L53
+ * "layerdrop"
+ - Def
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L72-L74
+ - Base
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/base_960h.yaml#L54
+ - Large
+ https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/vox_960h.yaml#L54
+ """
+ feature_projection = FeatureProjection(in_features, embed_dim, dropout_input)
+ pos_conv = ConvolutionalPositionalEmbedding(embed_dim, pos_conv_kernel, pos_conv_groups)
+
+ # Original impl
+ # https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L768-L782
+ encoder_layers = nn.ModuleList()
+ for _ in range(num_layers):
+ attention = SelfAttention(
+ embed_dim=embed_dim,
+ num_heads=num_heads,
+ dropout=attention_dropout,
+ )
+ feed_forward = FeedForward(
+ io_features=embed_dim,
+ intermediate_features=ff_interm_features,
+ intermediate_dropout=ff_interm_dropout,
+ output_dropout=dropout,
+ )
+ encoder_layers.append(
+ EncoderLayer(
+ attention=attention,
+ dropout=dropout,
+ layer_norm_first=layer_norm_first,
+ feed_forward=feed_forward,
+ )
+ )
+ transformer = Transformer(
+ pos_conv_embed=pos_conv,
+ dropout=dropout,
+ layers=encoder_layers,
+ layer_norm_first=not layer_norm_first,
+ layer_drop=layer_drop,
+ )
+ return Encoder(feature_projection, transformer)
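+
+
+ # A minimal construction sketch (illustrative only, not part of the module), using the
+ # Base-architecture values listed in the docstring above:
+ def _encoder_sketch():
+     encoder = _get_encoder(
+         in_features=512,
+         embed_dim=768,
+         dropout_input=0.1,
+         pos_conv_kernel=128,
+         pos_conv_groups=16,
+         num_layers=12,
+         num_heads=12,
+         attention_dropout=0.1,
+         ff_interm_features=3072,
+         ff_interm_dropout=0.1,
+         dropout=0.1,
+         layer_norm_first=False,
+         layer_drop=0.1,
+     )
+     features = torch.randn(2, 49, 512)  # output of the feature extractor
+     out = encoder(features, lengths=torch.tensor([49, 25]))
+     # out: (2, 49, 768)
+     return out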
+
+
+def _get_wavlm_encoder(
+ in_features: int,
+ embed_dim: int,
+ dropout_input: float,
+ pos_conv_kernel: int,
+ pos_conv_groups: int,
+ num_layers: int,
+ num_heads: int,
+ num_buckets: int,
+ max_distance: int,
+ attention_dropout: float,
+ ff_interm_features: int,
+ ff_interm_dropout: float,
+ dropout: float,
+ layer_norm_first: bool,
+ layer_drop: float,
+) -> Encoder:
+ """
+ Construct encoder for WavLM model :cite:`chen2022wavlm`. The structure of the encoder and most of the arguments are
+ the same as in :py:func:`_get_encoder`, so refer there for documentation. The only difference from the Wav2Vec2 encoder
+ is usage of `WavLMSelfAttention` instead of `SelfAttention` and two additional parameters: `num_buckets` and
+ `max_distance`.
+ Args:
+ in_features (int): See :py:func:`_get_encoder`.
+ embed_dim (int): See :py:func:`_get_encoder`.
+ dropout_input (float): See :py:func:`_get_encoder`.
+ pos_conv_kernel (int): See :py:func:`_get_encoder`.
+ pos_conv_groups (int): See :py:func:`_get_encoder`.
+ num_layers (int): See :py:func:`_get_encoder`.
+ num_heads (int): See :py:func:`_get_encoder`.
+ num_buckets (int): Number of buckets for relative position embedding.
+ max_distance (int): Maximum distance for relative position embedding.
+ attention_dropout (float): See :py:func:`_get_encoder`.
+ ff_interm_features (int): See :py:func:`_get_encoder`.
+ ff_interm_dropout (float): See :py:func:`_get_encoder`.
+ dropout (float): See :py:func:`_get_encoder`.
+ layer_norm_first (bool): See :py:func:`_get_encoder`.
+ layer_drop (float): See :py:func:`_get_encoder`.
+
+ """
+ feature_projection = FeatureProjection(in_features, embed_dim, dropout_input)
+ pos_conv = ConvolutionalPositionalEmbedding(embed_dim, pos_conv_kernel, pos_conv_groups)
+
+ # Original impl
+ # https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L768-L782
+ encoder_layers = nn.ModuleList()
+ for i in range(num_layers):
+ attention = WavLMSelfAttention(
+ embed_dim=embed_dim,
+ num_heads=num_heads,
+ num_buckets=num_buckets,
+ max_distance=max_distance,
+ dropout=attention_dropout,
+ has_relative_attention_bias=(i == 0), # Position embedding is only necessary in the first layer.
+ )
+ feed_forward = FeedForward(
+ io_features=embed_dim,
+ intermediate_features=ff_interm_features,
+ intermediate_dropout=ff_interm_dropout,
+ output_dropout=dropout,
+ )
+ encoder_layers.append(
+ EncoderLayer(
+ attention=attention,
+ dropout=dropout,
+ layer_norm_first=layer_norm_first,
+ feed_forward=feed_forward,
+ )
+ )
+ transformer = Transformer(
+ pos_conv_embed=pos_conv,
+ dropout=dropout,
+ layers=encoder_layers,
+ layer_norm_first=not layer_norm_first,
+ layer_drop=layer_drop,
+ )
+ return Encoder(feature_projection, transformer)
+
+
+def _compute_mask_indices(
+ shape: Tuple[int, int],
+ padding_mask: Optional[Tensor],
+ mask_prob: float,
+ mask_length: int,
+ mask_type: str = "static",
+ mask_other: float = 0.0,
+ min_masks: int = 0,
+ no_overlap: bool = False,
+ min_space: int = 0,
+) -> Tensor:
+ """Computes random mask spans for a given shape.
+ Args:
+ shape (int, int): The shape for which to compute masks.
+ The first element is batch size and second is the number of frames.
+ padding_mask (Tensor or None): The padding mask of the same dimension as shape,
+ which will prevent masking padded elements.
+ mask_prob (float): Probability for each token to be chosen as start of the span to be masked.
+ This will be multiplied by number of timesteps divided by length of mask span to mask
+ approximately this percentage of all elements. However due to overlaps, the actual number
+ will be smaller (unless no_overlap is True).
+ mask_length (int): The length of each masked span.
+ mask_type (str): How to compute mask lengths. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+ ``static``: Fixed size
+ ``uniform``: Sample from uniform distribution [mask_other, mask_length*2]
+ ``normal``: Sample from normal distribution with mean ``mask_length`` and stdev ``mask_other``.
+ ``poisson``: Sample from Poisson distribution with lambda = ``mask_length``.
+ mask_other (float): Secondary mask argument; its interpretation depends on ``mask_type`` (see above).
+ min_masks (int): Minimum number of masked spans.
+ no_overlap (bool): If True, switch to an alternative recursive algorithm
+ that prevents spans from overlapping.
+ min_space (int): How many frames to keep unmasked between spans (only used if no_overlap is True).
+
+ Returns:
+ (Tensor): The mask indices of dimension `[batch, frame]`.
+ """
+
+ batch_size, frame = shape
+ mask = torch.full((batch_size, frame), False)
+ # add a random number for probabilistic rounding
+ all_num_mask = int(mask_prob * frame / float(mask_length) + torch.rand(1))
+
+ all_num_mask = max(min_masks, all_num_mask)
+
+ mask_idcs = []
+ for i in range(batch_size):
+ if padding_mask is not None:
+ sz = frame - padding_mask[i].long().sum().item()
+ # add a random number for probabilistic rounding
+ num_mask = int(mask_prob * sz / float(mask_length) + torch.rand(1))
+ num_mask = max(min_masks, num_mask)
+ else:
+ sz = frame
+ num_mask = all_num_mask
+
+ if mask_type == "static":
+ lengths = torch.full((num_mask,), mask_length)
+ elif mask_type == "uniform":
+ lengths = torch.randint(int(mask_other), mask_length * 2 + 1, size=(num_mask,))
+ elif mask_type == "normal":
+ lengths = torch.normal(mask_length, mask_other, size=(num_mask,))
+ lengths = torch.maximum(torch.ones(1), torch.round(lengths)).int()
+ elif mask_type == "poisson":
+ lengths = torch.poisson(mask_length, size=(num_mask,))
+ lengths = torch.round(lengths).int()
+ else:
+ raise Exception(f"unknown mask selection: {mask_type}")
+
+ if sum(lengths) == 0:
+ lengths[0] = min(mask_length, sz - 1)
+
+ if no_overlap:
+ mask_idc = []
+
+ def arrange(s, e, length, keep_length):
+ span_start = torch.randint(s, e - length, size=(1,))
+ mask_idc.extend(span_start + i for i in range(length))
+
+ new_parts = []
+ if span_start - s - min_space >= keep_length:
+ new_parts.append((s, span_start - min_space + 1))
+ if e - span_start - keep_length - min_space > keep_length:
+ new_parts.append((span_start + length + min_space, e))
+ return new_parts
+
+ parts = [(0, sz)]
+ min_length = min(lengths)
+ for length in sorted(lengths, reverse=True):
+ lens = torch.tensor([e - s for s, e in parts], dtype=torch.int)
+ lens[lens < length + min_space] = 0
+ l_sum = lens.sum()
+ if l_sum == 0:
+ break
+ probs = lens / l_sum
+ c = torch.distributions.categorical.Categorical(probs).sample()
+ s, e = parts.pop(c)
+ parts.extend(arrange(s, e, length, min_length))
+ mask_idc = torch.tensor(mask_idc)
+ else:
+ min_len = min(lengths)
+ if sz - min_len <= num_mask:
+ min_len = sz - num_mask - 1
+
+ mask_idc = torch.randperm(sz - min_len)[:num_mask]
+ mask_idc = torch.tensor(
+ [mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])]
+ )
+
+ mask_idcs.append(torch.unique(mask_idc[mask_idc < sz]))
+
+ min_len = min([len(m) for m in mask_idcs])
+ for i, mask_idc in enumerate(mask_idcs):
+ if len(mask_idc) > min_len:
+ mask_idc = mask_idc[torch.randperm(len(mask_idc))[:min_len].long()]
+ mask[i, mask_idc] = True
+
+ return mask
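+
+
+ # A minimal usage sketch (illustrative only, not part of the module): mask roughly 65% of
+ # 100 frames with fixed-length spans of 10 frames, values in the range used for
+ # wav2vec 2.0 style pretraining.
+ def _compute_mask_indices_sketch():
+     mask = _compute_mask_indices(
+         shape=(2, 100),
+         padding_mask=None,
+         mask_prob=0.65,
+         mask_length=10,
+         mask_type="static",
+         min_masks=2,
+     )
+     # mask: boolean Tensor of shape (2, 100); True marks frames selected for masking
+     return mask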
+
+
+def _get_padding_mask(input: Tensor, lengths: Tensor) -> Tensor:
+ """Generate the padding mask given the padded input and the lengths Tensors.
+ Args:
+ input (Tensor): The padded Tensor of dimension `[batch, max_len, frequency]`.
+ lengths (Tensor): The lengths Tensor of dimension `[batch,]`.
+
+ Returns:
+ (Tensor): The padding mask.
+ """
+ batch_size, max_len, _ = input.shape
+ mask = torch.arange(max_len, device=lengths.device).expand(batch_size, max_len) >= lengths[:, None]
+ return mask
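+
+
+ # Worked example (illustrative only, not part of the module): ``True`` marks padded
+ # (invalid) frames.
+ def _padding_mask_sketch():
+     padded = torch.zeros(2, 4, 512)  # (batch, max_len, frequency)
+     mask = _get_padding_mask(padded, lengths=torch.tensor([2, 4]))
+     # mask == tensor([[False, False, True,  True],
+     #                 [False, False, False, False]])
+     return mask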
+
+
+class MaskGenerator(Module):
+ """Generate the masks for masked prediction.
+ Args:
+ encoder_embed_dim (int): The dimension of the transformer embedding output.
+ mask_prob (float): Probability for each token to be chosen as start of the span to be masked.
+ This will be multiplied by number of timesteps divided by length of mask span to mask
+ approximately this percentage of all elements. However due to overlaps, the actual number
+ will be smaller (unless no_overlap is True).
+ mask_selection (str): How to choose the mask length.
+ Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+ mask_other (float): Secondary mask argument (used for more complex distributions).
+ mask_length (int): The lengths of the mask.
+ no_mask_overlap (bool): Whether to allow masks to overlap.
+ mask_min_space (int): Minimum space between spans (if no overlap is enabled).
+ mask_channel_prob (float): The probability of replacing a feature with 0.
+ mask_channel_selection (str): How to choose the mask length for channel masking.
+ Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+ mask_channel_other (float): Secondary mask argument for channel masking (used for more complex distributions).
+ mask_channel_length (int): The length of each masked span for channel masking.
+ no_mask_channel_overlap (bool): Whether to allow channel masks to overlap.
+ mask_channel_min_space (int): Minimum space between spans for channel masking (if no overlap is enabled).
+ """
+
+ def __init__(
+ self,
+ encoder_embed_dim: int,
+ mask_prob: float,
+ mask_selection: str,
+ mask_other: float,
+ mask_length: int,
+ no_mask_overlap: bool,
+ mask_min_space: int,
+ mask_channel_prob: float,
+ mask_channel_selection: str,
+ mask_channel_other: float,
+ mask_channel_length: int,
+ no_mask_channel_overlap: bool,
+ mask_channel_min_space: int,
+ ):
+ super().__init__()
+ self.mask_prob = mask_prob
+ self.mask_selection = mask_selection
+ self.mask_other = mask_other
+ self.mask_length = mask_length
+ self.no_mask_overlap = no_mask_overlap
+ self.mask_min_space = mask_min_space
+ self.mask_channel_prob = mask_channel_prob
+ self.mask_channel_selection = mask_channel_selection
+ self.mask_channel_other = mask_channel_other
+ self.mask_channel_length = mask_channel_length
+ self.no_mask_channel_overlap = no_mask_channel_overlap
+ self.mask_channel_min_space = mask_channel_min_space
+ self.mask_embedding = Parameter(torch.FloatTensor(encoder_embed_dim))
+ torch.nn.init.uniform_(self.mask_embedding)
+
+ def forward(self, x: Tensor, padding_mask: Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ x (Tensor): The encoded representations after feature extraction module.
+ padding_mask (Tensor or None): The padding mask of the same dimension as shape,
+ which will prevent masking padded elements.
+
+ Returns:
+ Tensor: The feature representations after masking.
+ Tensor: The generated mask indices.
+ """
+ B, T, C = x.shape
+ if self.mask_prob > 0:
+ mask_indices = _compute_mask_indices(
+ (B, T),
+ padding_mask,
+ self.mask_prob,
+ self.mask_length,
+ self.mask_selection,
+ self.mask_other,
+ min_masks=2,
+ no_overlap=self.no_mask_overlap,
+ min_space=self.mask_min_space,
+ )
+ mask_indices = mask_indices.to(x.device)
+ # change dtype of mask_embedding to x for mixed-precision training.
+ # see https://github.com/pytorch/audio/issues/2847 for details.
+ x[mask_indices] = self.mask_embedding.to(x.dtype)
+ else:
+ mask_indices = None
+
+ if self.mask_channel_prob > 0:
+ mask_channel_indices = _compute_mask_indices(
+ (B, C),
+ None,
+ self.mask_channel_prob,
+ self.mask_channel_length,
+ self.mask_channel_selection,
+ self.mask_channel_other,
+ no_overlap=self.no_mask_channel_overlap,
+ min_space=self.mask_channel_min_space,
+ )
+ mask_channel_indices = mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1)
+ x[mask_channel_indices] = 0
+
+ return x, mask_indices
+
+
+def _compute_logits(
+ proj_x: Tensor,
+ target: Tensor,
+ label_embeddings: Parameter,
+) -> Tensor:
+ """Compute the logits of the embeddings.
+ Args:
+ proj_x (Tensor): The projected masked representations of dimension `[batch, frame, final_dim]`.
+ target (Tensor): The target Tensor of dimension `[batch, frame, final_dim]`.
+ label_embeddings (Parameter): The trainable embeddings of target of dimension `[num_class, final_dim]`.
+
+ Returns:
+ (Tensor): The logits of the inputs.
+ """
+ logit_temp = 0.1
+ pos = torch.index_select(label_embeddings, 0, target.long())
+ negs = label_embeddings.unsqueeze(1).expand(-1, proj_x.size(0), -1)
+ neg_is_pos = (pos == negs).all(-1)
+ pos = pos.unsqueeze(0)
+ targets = torch.cat([pos, negs], dim=0)
+
+ logits = torch.cosine_similarity(proj_x.float(), targets.float(), dim=-1).type_as(proj_x)
+ logits /= logit_temp
+ if neg_is_pos.any():
+ logits[1:][neg_is_pos] = float("-inf")
+ logits = logits.transpose(0, 1) # (num_x, num_cls+1)
+ return logits
+
+
+class LogitGenerator(Module):
+ """Generate the logits of masked and unmasked inputs.
+ Args:
+ encoder_embed_dim (int): The dimension of the transformer embedding output.
+ num_classes (int): The number of classes in the labels.
+ final_dim (int): Project final representations and targets to `final_dim`.
+ skip_masked (bool): If True, skip computing losses over masked frames.
+ skip_nomask (bool): If True, skip computing losses over unmasked frames.
+ """
+
+ def __init__(
+ self,
+ encoder_embed_dim: int,
+ num_classes: int,
+ final_dim: int,
+ skip_masked: bool,
+ skip_nomask: bool,
+ ):
+ super().__init__()
+ self.label_embeddings = Parameter(torch.FloatTensor(num_classes, final_dim))
+ torch.nn.init.uniform_(self.label_embeddings)
+ self.final_proj = torch.nn.Linear(encoder_embed_dim, final_dim)
+ self.skip_masked = skip_masked
+ self.skip_nomask = skip_nomask
+
+ def forward(self, x: Tensor, label: Tensor, mask_m: Tensor, mask_u: Tensor) -> Tuple[Tensor, Tensor]:
+ """
+ Args:
+ x (Tensor): The feature representation of the last transformer layer.
+ label (Tensor): The label Tensor of dimension `[batch, frame]`.
+ mask_m (Tensor): The masked indices of dimension `[batch, frame]`.
+ mask_u (Tensor): The unmasked indices of dimension `[batch, frame]`.
+
+ Returns:
+ Tensor: The logits of masked frames. Tensor of dimension `[masked_frame, final_dim]`.
+ Tensor: The logits of unmasked frames. Tensor of dimension `[unmasked_frame, final_dim]`.
+ """
+ proj_x = self.final_proj(x)
+ if self.skip_masked:
+ logit_m = None
+ else:
+ proj_x_m = proj_x[mask_m]
+ label_m = label[mask_m]
+ logit_m = _compute_logits(proj_x_m, label_m, self.label_embeddings)
+
+ if self.skip_nomask:
+ logit_u = None
+ else:
+ proj_x_u = proj_x[mask_u]
+ label_u = label[mask_u]
+ logit_u = _compute_logits(proj_x_u, label_u, self.label_embeddings)
+ return logit_m, logit_u
+
+
+class GradMultiply(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x, scale):
+ ctx.scale = scale
+ res = x.new(x)
+ return res
+
+ @staticmethod
+ def backward(ctx, grad):
+ return grad * ctx.scale, None
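+
+
+ # A minimal sketch of how GradMultiply is used (illustrative only, not part of the
+ # module): it scales gradients flowing back into the feature extractor while leaving the
+ # forward values untouched (see ``HuBERTPretrainModel.forward`` in ``model.py``).
+ def _grad_multiply_sketch():
+     x = torch.randn(3, requires_grad=True)
+     y = GradMultiply.apply(x, 0.1)  # forward pass: y has the same values as x
+     y.sum().backward()
+     # x.grad is 0.1 everywhere instead of 1.0
+     return x.grad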
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/model.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c4d7b1ad1651af8a3ec69215e9c885dbe240e75
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/model.py
@@ -0,0 +1,1579 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+from torch.nn import Module
+
+from . import components
+
+
+class Wav2Vec2Model(Module):
+ """Acoustic model used in *wav2vec 2.0* :cite:`baevski2020wav2vec`.
+
+ Note:
+ To build the model, please use one of the factory functions.
+
+ See Also:
+ * :class:`torchaudio.pipelines.Wav2Vec2Bundle`: Pretrained models (without fine-tuning)
+ * :class:`torchaudio.pipelines.Wav2Vec2ASRBundle`: ASR pipelines with pretrained models.
+
+ Args:
+ feature_extractor (torch.nn.Module):
+ Feature extractor that extracts feature vectors from raw audio Tensor.
+
+ encoder (torch.nn.Module):
+ Encoder that converts the audio features into the sequence of probability
+ distribution (in negative log-likelihood) over labels.
+
+ aux (torch.nn.Module or None, optional):
+ Auxiliary module. If provided, the output from encoder is passed to this module.
+ """ # noqa: E501
+
+ def __init__(
+ self,
+ feature_extractor: Module,
+ encoder: Module,
+ aux: Optional[Module] = None,
+ ):
+ super().__init__()
+ self.feature_extractor = feature_extractor
+ self.encoder = encoder
+ self.aux = aux
+
+ @torch.jit.export
+ def extract_features(
+ self,
+ waveforms: Tensor,
+ lengths: Optional[Tensor] = None,
+ num_layers: Optional[int] = None,
+ ) -> Tuple[List[Tensor], Optional[Tensor]]:
+ """Extract feature vectors from raw waveforms
+
+ This returns the list of outputs from the intermediate layers of
+ transformer block in encoder.
+
+ Args:
+ waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
+ lengths (Tensor or None, optional):
+ Indicates the valid length of each audio in the batch.
+ Shape: `(batch, )`.
+ When the ``waveforms`` contains audios with different durations,
+ by providing ``lengths`` argument, the model will compute
+ the corresponding valid output lengths and apply proper mask in
+ transformer attention layer.
+ If ``None``, it is assumed that the entire audio waveform
+ length is valid.
+ num_layers (int or None, optional):
+ If given, limit the number of intermediate layers to go through.
+ Providing `1` will stop the computation after going through one
+ intermediate layer. If not given, the outputs from all the
+ intermediate layers are returned.
+
+ Returns:
+ (List[Tensor], Optional[Tensor]):
+ List of Tensors
+ Features from requested layers.
+ Each Tensor is of shape: `(batch, time frame, feature dimension)`
+ Tensor or None
+ If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
+ is returned.
+ It indicates the valid length in time axis of each feature Tensor.
+ """
+ x, lengths = self.feature_extractor(waveforms, lengths)
+ x = self.encoder.extract_features(x, lengths, num_layers)
+ return x, lengths
+
+ def forward(
+ self,
+ waveforms: Tensor,
+ lengths: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """Compute the sequence of probability distribution over labels.
+
+ Args:
+ waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
+ lengths (Tensor or None, optional):
+ Indicates the valid length of each audio in the batch.
+ Shape: `(batch, )`.
+ When the ``waveforms`` contains audios with different durations,
+ by providing ``lengths`` argument, the model will compute
+ the corresponding valid output lengths and apply proper mask in
+ transformer attention layer.
+ If ``None``, it is assumed that all the audio in ``waveforms``
+ have valid length. Default: ``None``.
+
+ Returns:
+ (Tensor, Optional[Tensor]):
+ Tensor
+ The sequences of probability distribution (in logit) over labels.
+ Shape: `(batch, frames, num labels)`.
+ Tensor or None
+ If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
+ is returned.
+ It indicates the valid length in time axis of the output Tensor.
+ """
+ x, lengths = self.feature_extractor(waveforms, lengths)
+ x = self.encoder(x, lengths)
+ if self.aux is not None:
+ x = self.aux(x)
+ return x, lengths
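+
+
+ # A minimal usage sketch (illustrative only, not part of the module), building a model with
+ # the ``wav2vec2_model`` factory defined later in this file and the Base-architecture values
+ # documented in ``components.py``:
+ def _wav2vec2_usage_sketch():
+     model = wav2vec2_model(
+         extractor_mode="group_norm",
+         extractor_conv_layer_config=None,
+         extractor_conv_bias=False,
+         encoder_embed_dim=768,
+         encoder_projection_dropout=0.1,
+         encoder_pos_conv_kernel=128,
+         encoder_pos_conv_groups=16,
+         encoder_num_layers=12,
+         encoder_num_heads=12,
+         encoder_attention_dropout=0.1,
+         encoder_ff_interm_features=3072,
+         encoder_ff_interm_dropout=0.1,
+         encoder_dropout=0.1,
+         encoder_layer_norm_first=False,
+         encoder_layer_drop=0.1,
+         aux_num_out=None,
+     ).eval()
+     waveforms = torch.randn(2, 16000)  # (batch, frames) of raw audio
+     lengths = torch.tensor([16000, 8000])
+     with torch.no_grad():
+         features, out_lengths = model(waveforms, lengths)
+         layer_outputs, _ = model.extract_features(waveforms, lengths, num_layers=4)
+     # With ``aux_num_out=None`` the forward output is the encoder features, shape (2, frame, 768);
+     # layer_outputs is a list of 4 Tensors from the first transformer layers.
+     return features, out_lengths, layer_outputs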
+
+
+class HuBERTPretrainModel(Module):
+ """HuBERTPretrainModel()
+
+ HuBERT model used for pretraining in *HuBERT* :cite:`hsu2021hubert`.
+
+ Note:
+ To build the model, please use one of the factory functions.
+
+ See Also:
+ `HuBERT Pre-training and Fine-tuning Recipes
+ `__
+
+ Args:
+ wav2vec2 (Wav2Vec2Model):
+ Wav2Vec2 encoder that generates the transformer outputs.
+
+ mask_generator (torch.nn.Module):
+ Mask generator that generates the mask for masked prediction during the training.
+
+ logit_generator (torch.nn.Module):
+ Logit generator that predicts the logits of the masked and unmasked inputs.
+
+ feature_grad_mult (float or None):
+ The factor to scale the convolutional feature extraction layer gradients by.
+ If ``None``, the gradients of feature extraction layers are not affected.
+ The scale factor will not affect the forward pass.
+ """
+
+ def __init__(
+ self,
+ wav2vec2: Wav2Vec2Model,
+ mask_generator: Module,
+ logit_generator: Module,
+ feature_grad_mult: Optional[float],
+ ):
+ super().__init__()
+ self.wav2vec2 = wav2vec2
+ self.mask_generator = mask_generator
+ self.logit_generator = logit_generator
+ if feature_grad_mult is not None and not 0.0 < feature_grad_mult < 1.0:
+ raise ValueError(
+ f"The value of `feature_grad_mult` must be ``None``or between (0, 1). Found {feature_grad_mult}"
+ )
+ self.feature_grad_mult = feature_grad_mult
+
+ def forward(
+ self,
+ waveforms: Tensor,
+ labels: Tensor,
+ audio_lengths: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Tensor, Tensor]:
+ """Compute the sequence of probability distribution over labels.
+
+ Args:
+ waveforms (Tensor): Audio tensor of dimension `[batch, frames]`.
+ labels (Tensor): Label for pre-training. A Tensor of dimension `[batch, frames]`.
+ audio_lengths (Tensor or None, optional):
+ Indicates the valid length of each audio in the batch.
+ Shape: `[batch, ]`.
+ When the ``waveforms`` contains audios with different durations,
+ by providing ``lengths`` argument, the model will compute
+ the corresponding valid output lengths and apply proper mask in
+ transformer attention layer.
+ If ``None``, it is assumed that all the audio in ``waveforms``
+ have valid length. Default: ``None``.
+
+ Returns:
+ (Tensor, Tensor, Tensor):
+ Tensor
+ The masked sequences of probability distribution (in logit).
+ Shape: `(masked_frames, num labels)`.
+ Tensor
+ The unmasked sequence of probability distribution (in logit).
+ Shape: `(unmasked_frames, num labels)`.
+ Tensor
+ The feature mean value for additional penalty loss.
+ Shape: `(1,)`.
+ """
+ x, lengths = self.wav2vec2.feature_extractor(waveforms, audio_lengths)
+ if self.feature_grad_mult is not None and self.feature_grad_mult < 1.0:
+ x = components.GradMultiply.apply(x, self.feature_grad_mult)
+ features_pen = x.float().pow(2).mean()
+ if lengths is not None:
+ padding_mask = components._get_padding_mask(x, lengths)
+ else:
+ padding_mask = None
+ x, attention_mask = self.wav2vec2.encoder._preprocess(x, lengths)
+ x, mask = self.mask_generator(x, padding_mask)
+ x = self.wav2vec2.encoder.transformer(x, attention_mask=attention_mask)
+ if x.shape[1] != labels.shape[1]:
+ raise ValueError("The length of label must match that of HuBERT model output")
+ if padding_mask is not None:
+ mask_m = torch.logical_and(~padding_mask, mask)
+ mask_u = torch.logical_and(~padding_mask, ~mask_m)
+ else:
+ mask_m = mask
+ mask_u = ~mask_m
+
+ logit_m, logit_u = self.logit_generator(x, labels, mask_m, mask_u)
+
+ return logit_m, logit_u, features_pen
+
+
+def wav2vec2_model(
+ extractor_mode: str,
+ extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
+ extractor_conv_bias: bool,
+ encoder_embed_dim: int,
+ encoder_projection_dropout: float,
+ encoder_pos_conv_kernel: int,
+ encoder_pos_conv_groups: int,
+ encoder_num_layers: int,
+ encoder_num_heads: int,
+ encoder_attention_dropout: float,
+ encoder_ff_interm_features: int,
+ encoder_ff_interm_dropout: float,
+ encoder_dropout: float,
+ encoder_layer_norm_first: bool,
+ encoder_layer_drop: float,
+ aux_num_out: Optional[int],
+) -> Wav2Vec2Model:
+ """Builds custom :class:`~torchaudio.models.Wav2Vec2Model`.
+
+ Note:
+ The "feature extractor" below corresponds to
+ `ConvFeatureExtractionModel `__
+ in the original ``fairseq`` implementation.
+ This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0*
+ :cite:`baevski2020wav2vec` paper.
+
+ The "encoder" below corresponds to `TransformerEncoder `__,
+ and this is referred as "Transformer" in the paper.
+
+ Args:
+ extractor_mode (str): Operation mode of feature extractor.
+ Valid values are ``"group_norm"`` or ``"layer_norm"``.
+ If ``"group_norm"``, then a single normalization is applied
+ in the first convolution block. Otherwise, all the convolution
+ blocks will have layer normalization.
+
+ This option corresponds to ``extractor_mode`` from ``fairseq``.
+ extractor_conv_layer_config (list of integer tuples or None):
+ Configuration of convolution layers in feature extractor.
+ List of convolution configuration,
+ i.e. ``[(output_channel, kernel_size, stride), ...]``
+
+ If ``None`` is provided, then the following default value is used.
+
+ .. code-block:: python
+
+ [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ]
+
+ This option corresponds to ``conv_feature_layers`` from ``fairseq``.
+
+ extractor_conv_bias (bool):
+            Whether to include a bias term in each convolution operation.
+
+ This option corresponds to ``conv_bias`` from ``fairseq``.
+
+ encoder_embed_dim (int):
+ The dimension of embedding in encoder.
+
+ This option corresponds to ``encoder_embed_dim`` from ``fairseq``.
+
+ encoder_projection_dropout (float):
+ The dropout probability applied after the input feature is projected
+ to ``encoder_embed_dim``.
+
+ This option corresponds to ``dropout_input`` from ``fairseq``.
+
+ encoder_pos_conv_kernel (int):
+ The kernel size of convolutional positional embeddings.
+
+ This option corresponds to ``conv_pos`` from ``fairseq``.
+
+ encoder_pos_conv_groups (int):
+ The number of groups of convolutional positional embeddings.
+
+ This option corresponds to ``conv_pos_groups`` from ``fairseq``.
+
+ encoder_num_layers (int):
+ The number of self attention layers in transformer block.
+
+ This option corresponds to ``encoder_layers`` from ``fairseq``.
+
+ encoder_num_heads (int):
+ The number of heads in self attention layers.
+
+ This option corresponds to ``encoder_attention_heads`` from ``fairseq``.
+
+ encoder_attention_dropout (float):
+ The dropout probability applied after softmax in self-attention layer.
+
+ This option corresponds to ``attention_dropout`` from ``fairseq``.
+
+ encoder_ff_interm_features (int):
+ The dimension of hidden features in feed forward layer.
+
+ This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``.
+
+ encoder_ff_interm_dropout (float):
+ The dropout probability applied in feedforward layer.
+
+            This option corresponds to ``activation_dropout`` from ``fairseq``.
+
+ encoder_dropout (float):
+ The dropout probability applied at the end of feed forward layer.
+
+ This option corresponds to ``dropout`` from ``fairseq``.
+
+ encoder_layer_norm_first (bool):
+ Control the order of layer norm in transformer layer and each encoder layer.
+ If True, in transformer layer, layer norm is applied before features are fed
+ to encoder layers. In encoder layer, two layer norms are applied before and after
+ self attention.
+ If False, in transformer layer, layer norm is applied after features are fed
+ to encoder layers. In encoder layer, two layer norms are applied after self
+ attention, before and after feed forward.
+
+ This option corresponds to ``layer_norm_first`` from ``fairseq``.
+
+ encoder_layer_drop (float):
+ Probability to drop each encoder layer during training.
+
+ This option corresponds to ``layerdrop`` from ``fairseq``.
+
+ aux_num_out (int or None):
+            When provided, attach an extra linear layer on top of the encoder, which can
+            be used for fine-tuning.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ if extractor_conv_layer_config is None:
+ extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
+
+ feature_extractor = components._get_feature_extractor(
+ extractor_mode, extractor_conv_layer_config, extractor_conv_bias
+ )
+ encoder = components._get_encoder(
+ in_features=extractor_conv_layer_config[-1][0],
+ embed_dim=encoder_embed_dim,
+ dropout_input=encoder_projection_dropout,
+ pos_conv_kernel=encoder_pos_conv_kernel,
+ pos_conv_groups=encoder_pos_conv_groups,
+ num_layers=encoder_num_layers,
+ num_heads=encoder_num_heads,
+ attention_dropout=encoder_attention_dropout,
+ ff_interm_features=encoder_ff_interm_features,
+ ff_interm_dropout=encoder_ff_interm_dropout,
+ dropout=encoder_dropout,
+ layer_norm_first=encoder_layer_norm_first,
+ layer_drop=encoder_layer_drop,
+ )
+ aux = None
+ if aux_num_out is not None:
+ aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
+ return Wav2Vec2Model(feature_extractor, encoder, aux)
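+
+
+# Illustrative sketch (hypothetical helper, not part of the original source): building a
+# deliberately small custom model with an explicit feature-extractor configuration. Every
+# hyper-parameter value below is an arbitrary example, not a published configuration.
+def _example_custom_wav2vec2() -> Wav2Vec2Model:
+    return wav2vec2_model(
+        extractor_mode="group_norm",
+        extractor_conv_layer_config=[(256, 10, 5), (256, 3, 2), (256, 2, 2)],
+        extractor_conv_bias=False,
+        encoder_embed_dim=256,
+        encoder_projection_dropout=0.1,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=4,
+        encoder_num_heads=4,
+        encoder_attention_dropout=0.1,
+        encoder_ff_interm_features=1024,
+        encoder_ff_interm_dropout=0.1,
+        encoder_dropout=0.1,
+        encoder_layer_norm_first=False,
+        encoder_layer_drop=0.0,
+        aux_num_out=None,
+    )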
+
+
+def wav2vec2_base(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.1,
+ encoder_ff_interm_dropout: float = 0.1,
+ encoder_dropout: float = 0.1,
+ encoder_layer_drop: float = 0.1,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "base" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`wav2vec2_model`.
+ aux_num_out (int or None, optional):
+ See :py:func:`wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ return wav2vec2_model(
+ extractor_mode="group_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=768,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=12,
+ encoder_num_heads=12,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=3072,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=False,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
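+
+
+# Illustrative usage sketch (hypothetical helper, not part of the original source):
+# extracting intermediate transformer features from a batch of dummy 16 kHz waveforms
+# with the "base" architecture. The waveforms and lengths are random placeholders.
+def _example_wav2vec2_base_features():
+    model = wav2vec2_base().eval()
+    waveforms = torch.randn(2, 16000)  # [batch, num_samples]
+    lengths = torch.tensor([16000, 12000])  # valid samples per waveform
+    with torch.no_grad():
+        features, out_lengths = model.extract_features(waveforms, lengths)
+    # ``features`` is a list with one tensor per transformer layer, each of shape
+    # [batch, frames, 768] for the base configuration.
+    return features, out_lengths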
+
+
+def wav2vec2_large(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.1,
+ encoder_ff_interm_dropout: float = 0.1,
+ encoder_dropout: float = 0.1,
+ encoder_layer_drop: float = 0.1,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "large" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`wav2vec2_model`.
+ aux_num_out (int or None, optional):
+ See :py:func:`wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ return wav2vec2_model(
+ extractor_mode="group_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=1024,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=24,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=4096,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=False,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def wav2vec2_large_lv60k(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.1,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.1,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "large lv-60k" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`wav2vec2_model`.
+ aux_num_out (int or None, optional):
+ See :py:func:`wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ return wav2vec2_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=True,
+ encoder_embed_dim=1024,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=24,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=4096,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def hubert_base(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.1,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.1,
+ encoder_layer_drop: float = 0.05,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "base" :class:`HuBERT ` from *HuBERT* :cite:`hsu2021hubert`
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`wav2vec2_model`.
+ aux_num_out (int or None, optional):
+ See :py:func:`wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ return wav2vec2_model(
+ extractor_mode="group_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=768,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=12,
+ encoder_num_heads=12,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=3072,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=False,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def hubert_large(
+ encoder_projection_dropout: float = 0.0,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "large" :class:`HuBERT ` from *HuBERT* :cite:`hsu2021hubert`
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`wav2vec2_model`.
+ aux_num_out (int or None, optional):
+ See :py:func:`wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ return wav2vec2_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=1024,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=24,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=4096,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def hubert_xlarge(
+ encoder_projection_dropout: float = 0.0,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "extra large" :class:`HuBERT ` from *HuBERT* :cite:`hsu2021hubert`
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`wav2vec2_model`.
+ aux_num_out (int or None, optional):
+ See :py:func:`wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """ # noqa: E501
+ return wav2vec2_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=1280,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=48,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=5120,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def _init_hubert_pretrain_model(module):
+ if isinstance(module, components.ConvLayerBlock):
+ torch.nn.init.kaiming_normal_(module.conv.weight)
+ elif isinstance(module, components.ConvolutionalPositionalEmbedding):
+        # initialize the weight from a normal distribution.
+ std = math.sqrt(4.0 / (module.embed_dim * module.kernel_size))
+ torch.nn.init.normal_(module.conv.weight, mean=0.0, std=std)
+ torch.nn.init.constant_(module.conv.bias, 0.0)
+ elif isinstance(module, components.SelfAttention):
+        # initialize the query, key, value, and out_proj parameters of the self-attention module.
+ torch.nn.init.xavier_uniform_(module.k_proj.weight, gain=1 / math.sqrt(2))
+ torch.nn.init.xavier_uniform_(module.v_proj.weight, gain=1 / math.sqrt(2))
+ torch.nn.init.xavier_uniform_(module.q_proj.weight, gain=1 / math.sqrt(2))
+ torch.nn.init.xavier_uniform_(module.out_proj.weight)
+ torch.nn.init.constant_(module.out_proj.bias, 0.0)
+ elif isinstance(module, components.Transformer):
+ module.apply(components._init_transformer_params)
+ else:
+ pass
+
+
+def hubert_pretrain_model(
+ extractor_mode: str,
+ extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
+ extractor_conv_bias: bool,
+ encoder_embed_dim: int,
+ encoder_projection_dropout: float,
+ encoder_pos_conv_kernel: int,
+ encoder_pos_conv_groups: int,
+ encoder_num_layers: int,
+ encoder_num_heads: int,
+ encoder_attention_dropout: float,
+ encoder_ff_interm_features: int,
+ encoder_ff_interm_dropout: float,
+ encoder_dropout: float,
+ encoder_layer_norm_first: bool,
+ encoder_layer_drop: float,
+ mask_prob: float,
+ mask_selection: str,
+ mask_other: float,
+ mask_length: int,
+ no_mask_overlap: bool,
+ mask_min_space: int,
+ mask_channel_prob: float,
+ mask_channel_selection: str,
+ mask_channel_other: float,
+ mask_channel_length: int,
+ no_mask_channel_overlap: bool,
+ mask_channel_min_space: int,
+ skip_masked: bool,
+ skip_nomask: bool,
+ num_classes: int,
+ final_dim: int,
+ feature_grad_mult: Optional[float],
+) -> HuBERTPretrainModel:
+ """Builds custom :class:`HuBERTPretrainModel` for training from scratch
+
+ Note:
+ The "feature extractor" below corresponds to
+ `ConvFeatureExtractionModel `__
+ in the original ``fairseq`` implementation.
+        This is referred to as "(convolutional) feature encoder" in the *wav2vec 2.0*
+ :cite:`baevski2020wav2vec` paper.
+
+ The "encoder" below corresponds to `TransformerEncoder `__,
+        and this is referred to as "Transformer" in the paper.
+
+ Args:
+ extractor_mode (str): Operation mode of feature extractor.
+ Valid values are ``"group_norm"`` or ``"layer_norm"``.
+ If ``"group_norm"``, then a single normalization is applied
+ in the first convolution block. Otherwise, all the convolution
+ blocks will have layer normalization.
+
+ This option corresponds to ``extractor_mode`` from ``fairseq``.
+
+ extractor_conv_layer_config (list of integer tuples or None):
+ Configuration of convolution layers in feature extractor.
+ List of convolution configuration,
+ i.e. ``[(output_channel, kernel_size, stride), ...]``
+
+ If ``None`` is provided, then the following default value is used.
+
+ .. code-block:: python
+
+ [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ]
+
+ This option corresponds to ``conv_feature_layers`` from ``fairseq``.
+
+ extractor_conv_bias (bool):
+            Whether to include a bias term in each convolution operation.
+
+ This option corresponds to ``conv_bias`` from ``fairseq``.
+
+ encoder_embed_dim (int):
+ The dimension of embedding in encoder.
+
+ This option corresponds to ``encoder_embed_dim`` from ``fairseq``.
+
+ encoder_projection_dropout (float):
+ The dropout probability applied after the input feature is projected
+ to ``encoder_embed_dim``.
+
+ This option corresponds to ``dropout_input`` from ``fairseq``.
+
+ encoder_pos_conv_kernel (int):
+ The kernel size of convolutional positional embeddings.
+
+ This option corresponds to ``conv_pos`` from ``fairseq``.
+
+ encoder_pos_conv_groups (int):
+ The number of groups of convolutional positional embeddings.
+
+ This option corresponds to ``conv_pos_groups`` from ``fairseq``.
+
+ encoder_num_layers (int):
+ The number of self attention layers in transformer block.
+
+ This option corresponds to ``encoder_layers`` from ``fairseq``.
+
+ encoder_num_heads (int):
+ The number of heads in self attention layers.
+
+ This option corresponds to ``encoder_attention_heads`` from ``fairseq``.
+
+ encoder_attention_dropout (float):
+ The dropout probability applied after softmax in self-attention layer.
+
+ This option corresponds to ``attention_dropout`` from ``fairseq``.
+
+ encoder_ff_interm_features (int):
+ The dimension of hidden features in feed forward layer.
+
+ This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``.
+
+ encoder_ff_interm_dropout (float):
+ The dropout probability applied in feedforward layer.
+
+            This option corresponds to ``activation_dropout`` from ``fairseq``.
+
+ encoder_dropout (float):
+ The dropout probability applied at the end of feed forward layer.
+
+ This option corresponds to ``dropout`` from ``fairseq``.
+
+ encoder_layer_norm_first (bool):
+ Control the order of layer norm in transformer layer and each encoder layer.
+ If True, in transformer layer, layer norm is applied before features are fed
+ to encoder layers. In encoder layer, two layer norms are applied before and after
+ self attention.
+ If False, in transformer layer, layer norm is applied after features are fed
+ to encoder layers. In encoder layer, two layer norms are applied after self
+ attention, before and after feed forward.
+
+ This option corresponds to ``layer_norm_first`` from ``fairseq``.
+
+ encoder_layer_drop (float):
+ Probability to drop each encoder layer during training.
+
+ This option corresponds to ``layerdrop`` from ``fairseq``.
+
+ mask_prob (float):
+            Probability for each token to be chosen as the start of a span to be masked. This value is
+            multiplied by the number of timesteps divided by the length of the mask span, so that roughly
+            this proportion of all elements is masked. However, due to overlaps, the actual number will be
+            smaller (unless ``no_mask_overlap`` is ``True``).
+
+ This option corresponds to ``mask_prob`` from ``fairseq``.
+
+ mask_selection (str):
+ How to choose the mask length. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+
+ This option corresponds to ``mask_selection`` from ``fairseq``.
+
+ mask_other (float):
+ Secondary mask argument (used for more complex distributions).
+
+ This option corresponds to ``mask_other`` from ``fairseq``.
+
+ mask_length (int):
+ The lengths of the mask.
+
+ This option corresponds to ``mask_length`` from ``fairseq``.
+
+ no_mask_overlap (bool):
+ Whether to allow masks to overlap.
+
+ This option corresponds to ``no_mask_overlap`` from ``fairseq``.
+
+ mask_min_space (int):
+ Minimum space between spans (if no overlap is enabled).
+
+ This option corresponds to ``mask_min_space`` from ``fairseq``.
+
+        mask_channel_prob (float):
+ The probability of replacing a feature with 0.
+
+ This option corresponds to ``mask_channel_prob`` from ``fairseq``.
+
+ mask_channel_selection (str):
+ How to choose the mask length for channel masking. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+
+ This option corresponds to ``mask_channel_selection`` from ``fairseq``.
+
+ mask_channel_other (float):
+            Secondary mask argument for channel masking (used for more complex distributions).
+
+ This option corresponds to ``mask_channel_other`` from ``fairseq``.
+
+ mask_channel_length (int):
+            The length of each channel mask span.
+
+ This option corresponds to ``mask_channel_length`` from ``fairseq``.
+
+ no_mask_channel_overlap (bool):
+ Whether to allow channel masks to overlap.
+
+ This option corresponds to ``no_mask_channel_overlap`` from ``fairseq``.
+
+ mask_channel_min_space (int):
+            Minimum space between spans for channel masking (if no overlap is enabled).
+
+ This option corresponds to ``mask_channel_min_space`` from ``fairseq``.
+
+ skip_masked (bool):
+ If True, skip computing losses over masked frames.
+
+ This option corresponds to ``skip_masked`` from ``fairseq``.
+
+ skip_nomask (bool):
+ If True, skip computing losses over unmasked frames.
+
+ This option corresponds to ``skip_nomask`` from ``fairseq``.
+
+ num_classes (int):
+ The number of classes in the labels.
+
+ final_dim (int):
+ Project final representations and targets to `final_dim`.
+
+ This option corresponds to ``final_dim`` from ``fairseq``.
+
+ feature_grad_mult (float or None):
+ The factor to scale the convolutional feature extraction layer gradients by.
+ The scale factor will not affect the forward pass.
+
+ This option corresponds to ``feature_grad_mult`` from ``fairseq``.
+
+ Returns:
+ HuBERTPretrainModel:
+ The resulting model.
+ """ # noqa: E501
+ if extractor_conv_layer_config is None:
+ extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
+
+ feature_extractor = components._get_feature_extractor(
+ extractor_mode, extractor_conv_layer_config, extractor_conv_bias
+ )
+ encoder = components._get_encoder(
+ in_features=extractor_conv_layer_config[-1][0],
+ embed_dim=encoder_embed_dim,
+ dropout_input=encoder_projection_dropout,
+ pos_conv_kernel=encoder_pos_conv_kernel,
+ pos_conv_groups=encoder_pos_conv_groups,
+ num_layers=encoder_num_layers,
+ num_heads=encoder_num_heads,
+ attention_dropout=encoder_attention_dropout,
+ ff_interm_features=encoder_ff_interm_features,
+ ff_interm_dropout=encoder_ff_interm_dropout,
+ dropout=encoder_dropout,
+ layer_norm_first=encoder_layer_norm_first,
+ layer_drop=encoder_layer_drop,
+ )
+ wav2vec2 = Wav2Vec2Model(feature_extractor, encoder)
+ mask_generator = components.MaskGenerator(
+ encoder_embed_dim,
+ mask_prob,
+ mask_selection,
+ mask_other,
+ mask_length,
+ no_mask_overlap,
+ mask_min_space,
+ mask_channel_prob,
+ mask_channel_selection,
+ mask_channel_other,
+ mask_channel_length,
+ no_mask_channel_overlap,
+ mask_channel_min_space,
+ )
+ logit_generator = components.LogitGenerator(
+ encoder_embed_dim,
+ num_classes,
+ final_dim,
+ skip_masked,
+ skip_nomask,
+ )
+ model = HuBERTPretrainModel(
+ wav2vec2=wav2vec2,
+ mask_generator=mask_generator,
+ logit_generator=logit_generator,
+ feature_grad_mult=feature_grad_mult,
+ )
+ # initialize the model for pre-training
+ model.apply(_init_hubert_pretrain_model)
+ return model
+
+
+def hubert_pretrain_base(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.1,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.1,
+ encoder_layer_drop: float = 0.05,
+ mask_prob: float = 0.8,
+ mask_channel_prob: float = 0.0,
+ mask_channel_length: int = 10,
+ feature_grad_mult: Optional[float] = 0.1,
+ num_classes: int = 100,
+) -> HuBERTPretrainModel:
+ """Builds "base" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_layer_drop (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_prob (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_channel_prob (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_channel_length (int):
+ See :py:func:`hubert_pretrain_model`.
+ feature_grad_mult (float or None):
+ See :py:func:`hubert_pretrain_model`.
+ num_classes (int, optional):
+ See :py:func:`hubert_pretrain_model`.
+
+ Returns:
+ HuBERTPretrainModel:
+ The resulting model.
+ """ # noqa: E501
+ return hubert_pretrain_model(
+ extractor_mode="group_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=768,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=12,
+ encoder_num_heads=12,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=3072,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=False,
+ encoder_layer_drop=encoder_layer_drop,
+ mask_prob=mask_prob,
+ mask_selection="static",
+ mask_other=0.0,
+ mask_length=10,
+ no_mask_overlap=False,
+ mask_min_space=1,
+ mask_channel_prob=mask_channel_prob,
+ mask_channel_selection="static",
+ mask_channel_other=0.0,
+ mask_channel_length=mask_channel_length,
+ no_mask_channel_overlap=False,
+ mask_channel_min_space=1,
+ skip_masked=False,
+ skip_nomask=False,
+ num_classes=num_classes,
+ final_dim=256,
+ feature_grad_mult=feature_grad_mult,
+ )
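+
+
+# Illustrative sketch (hypothetical helper, not part of the original source): a dummy
+# pretraining forward pass with the "base" configuration. Waveforms and frame-level labels
+# are random placeholders; the number of labels per utterance is derived from the feature
+# extractor so that it matches the model output length, as required by
+# ``HuBERTPretrainModel.forward``.
+def _example_hubert_pretrain_forward():
+    model = hubert_pretrain_base(num_classes=100)
+    waveforms = torch.randn(2, 16000)
+    lengths = torch.tensor([16000, 16000])
+    with torch.no_grad():
+        frames, _ = model.wav2vec2.feature_extractor(waveforms, lengths)
+    labels = torch.randint(0, 100, (2, frames.shape[1]))
+    logit_m, logit_u, features_pen = model(waveforms, labels, lengths)
+    return logit_m, logit_u, features_pen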
+
+
+def hubert_pretrain_large(
+ encoder_projection_dropout: float = 0.0,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ mask_prob: float = 0.8,
+ mask_channel_prob: float = 0.0,
+ mask_channel_length: int = 10,
+ feature_grad_mult: Optional[float] = None,
+) -> HuBERTPretrainModel:
+ """Builds "large" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_layer_drop (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_prob (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_channel_prob (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_channel_length (int):
+ See :py:func:`hubert_pretrain_model`.
+ feature_grad_mult (float or None):
+ See :py:func:`hubert_pretrain_model`.
+
+ Returns:
+ HuBERTPretrainModel:
+ The resulting model.
+ """ # noqa: E501
+ return hubert_pretrain_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=1024,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=24,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=4096,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ mask_prob=mask_prob,
+ mask_selection="static",
+ mask_other=0.0,
+ mask_length=10,
+ no_mask_overlap=False,
+ mask_min_space=1,
+ mask_channel_prob=mask_channel_prob,
+ mask_channel_selection="static",
+ mask_channel_other=0.0,
+ mask_channel_length=mask_channel_length,
+ no_mask_channel_overlap=False,
+ mask_channel_min_space=1,
+ skip_masked=False,
+ skip_nomask=False,
+ num_classes=500,
+ final_dim=768,
+ feature_grad_mult=feature_grad_mult,
+ )
+
+
+def hubert_pretrain_xlarge(
+ encoder_projection_dropout: float = 0.0,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ mask_prob: float = 0.8,
+ mask_channel_prob: float = 0.0,
+ mask_channel_length: int = 10,
+ feature_grad_mult: Optional[float] = None,
+) -> HuBERTPretrainModel:
+ """Builds "extra large" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_dropout (float):
+ See :py:func:`hubert_pretrain_model`.
+ encoder_layer_drop (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_prob (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_channel_prob (float):
+ See :py:func:`hubert_pretrain_model`.
+ mask_channel_length (int):
+ See :py:func:`hubert_pretrain_model`.
+ feature_grad_mult (float or None):
+ See :py:func:`hubert_pretrain_model`.
+
+ Returns:
+ HuBERTPretrainModel:
+ The resulting model.
+ """ # noqa: E501
+ return hubert_pretrain_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=1280,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=48,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=5120,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ mask_prob=mask_prob,
+ mask_selection="static",
+ mask_other=0.0,
+ mask_length=10,
+ no_mask_overlap=False,
+ mask_min_space=1,
+ mask_channel_prob=mask_channel_prob,
+ mask_channel_selection="static",
+ mask_channel_other=0.0,
+ mask_channel_length=mask_channel_length,
+ no_mask_channel_overlap=False,
+ mask_channel_min_space=1,
+ skip_masked=False,
+ skip_nomask=False,
+ num_classes=500,
+ final_dim=1024,
+ feature_grad_mult=feature_grad_mult,
+ )
+
+
+def wavlm_model(
+ extractor_mode: str,
+ extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
+ extractor_conv_bias: bool,
+ encoder_embed_dim: int,
+ encoder_projection_dropout: float,
+ encoder_pos_conv_kernel: int,
+ encoder_pos_conv_groups: int,
+ encoder_num_layers: int,
+ encoder_num_heads: int,
+ encoder_num_buckets: int,
+ encoder_max_distance: int,
+ encoder_attention_dropout: float,
+ encoder_ff_interm_features: int,
+ encoder_ff_interm_dropout: float,
+ encoder_dropout: float,
+ encoder_layer_norm_first: bool,
+ encoder_layer_drop: float,
+ aux_num_out: Optional[int],
+) -> Wav2Vec2Model:
+ """Builds custom WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output object is
+ :class:`~torchaudio.models.Wav2Vec2Model`. Most of the arguments have the same meaning
+ as in :py:func:`~torchaudio.models.wav2vec2_model` so please refer there for documentation.
+
+ Args:
+ extractor_mode (str): Operation mode of feature extractor.
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ extractor_conv_layer_config (list of integer tuples or None):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ extractor_conv_bias (bool):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_embed_dim (int):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_projection_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_pos_conv_kernel (int):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_pos_conv_groups (int):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_num_layers (int):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_num_heads (int):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+        encoder_num_buckets (int):
+            Number of buckets for relative position embedding.
+
+        encoder_max_distance (int):
+            Maximum distance for relative position embedding.
+
+ encoder_attention_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_ff_interm_features (int):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_ff_interm_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_layer_norm_first (bool):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ encoder_layer_drop (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ aux_num_out (int or None):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """
+ if extractor_conv_layer_config is None:
+ extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
+
+ feature_extractor = components._get_feature_extractor(
+ extractor_mode, extractor_conv_layer_config, extractor_conv_bias
+ )
+ encoder = components._get_wavlm_encoder(
+ in_features=extractor_conv_layer_config[-1][0],
+ embed_dim=encoder_embed_dim,
+ dropout_input=encoder_projection_dropout,
+ pos_conv_kernel=encoder_pos_conv_kernel,
+ pos_conv_groups=encoder_pos_conv_groups,
+ num_layers=encoder_num_layers,
+ num_heads=encoder_num_heads,
+ num_buckets=encoder_num_buckets,
+ max_distance=encoder_max_distance,
+ attention_dropout=encoder_attention_dropout,
+ ff_interm_features=encoder_ff_interm_features,
+ ff_interm_dropout=encoder_ff_interm_dropout,
+ dropout=encoder_dropout,
+ layer_norm_first=encoder_layer_norm_first,
+ layer_drop=encoder_layer_drop,
+ )
+ aux = None
+ if aux_num_out is not None:
+ aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
+ return Wav2Vec2Model(feature_extractor, encoder, aux)
+
+
+def wavlm_base(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.1,
+ encoder_ff_interm_dropout: float = 0.1,
+ encoder_dropout: float = 0.1,
+ encoder_layer_drop: float = 0.1,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "base" WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+ :class:`~torchaudio.models.Wav2Vec2Model`.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ aux_num_out (int, optional):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """
+ return wavlm_model(
+ extractor_mode="group_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=768,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=12,
+ encoder_num_heads=12,
+ encoder_num_buckets=320,
+ encoder_max_distance=800,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=3072,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=False,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
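+
+
+# Illustrative usage sketch (hypothetical helper, not part of the original source): WavLM
+# builders return a plain ``Wav2Vec2Model``, so the model is used exactly like the
+# wav2vec 2.0 builders above. The input below is a random placeholder waveform.
+def _example_wavlm_base_features():
+    model = wavlm_base().eval()
+    waveforms = torch.randn(1, 16000)
+    with torch.no_grad():
+        features, _ = model.extract_features(waveforms)
+    return features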
+
+
+def wavlm_large(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.1,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.1,
+ encoder_layer_drop: float = 0.1,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds "large" WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+ :class:`~torchaudio.models.Wav2Vec2Model`.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ aux_num_out (int, optional):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """
+ return wavlm_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=False,
+ encoder_embed_dim=1024,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=24,
+ encoder_num_heads=16,
+ encoder_num_buckets=320,
+ encoder_max_distance=800,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=4096,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def wav2vec2_xlsr_300m(
+ encoder_projection_dropout: float = 0.0,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds XLS-R model :cite:`babu2021xls` with 300 millions of parameters. The architecture is compatible
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+ :class:`~torchaudio.models.Wav2Vec2Model`.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ aux_num_out (int, optional):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """
+ return wav2vec2_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=True,
+ encoder_embed_dim=1024,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=24,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=4096,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def wav2vec2_xlsr_1b(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds XLS-R model :cite:`babu2021xls` with 1 billion of parameters. The architecture is compatible
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+ :class:`~torchaudio.models.Wav2Vec2Model`.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ aux_num_out (int, optional):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """
+ return wav2vec2_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=True,
+ encoder_embed_dim=1280,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=48,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=5120,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
+
+
+def wav2vec2_xlsr_2b(
+ encoder_projection_dropout: float = 0.1,
+ encoder_attention_dropout: float = 0.0,
+ encoder_ff_interm_dropout: float = 0.0,
+ encoder_dropout: float = 0.0,
+ encoder_layer_drop: float = 0.0,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Builds XLS-R model :cite:`babu2021xls` with 2 billions of parameters. The architecture is compatible
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+ :class:`~torchaudio.models.Wav2Vec2Model`.
+
+ Args:
+ encoder_projection_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_attention_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_ff_interm_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_dropout (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ encoder_layer_drop (float):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+ aux_num_out (int, optional):
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting model.
+ """
+ return wav2vec2_model(
+ extractor_mode="layer_norm",
+ extractor_conv_layer_config=None,
+ extractor_conv_bias=True,
+ encoder_embed_dim=1920,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_pos_conv_kernel=128,
+ encoder_pos_conv_groups=16,
+ encoder_num_layers=48,
+ encoder_num_heads=16,
+ encoder_attention_dropout=encoder_attention_dropout,
+ encoder_ff_interm_features=7680,
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+ encoder_dropout=encoder_dropout,
+ encoder_layer_norm_first=True,
+ encoder_layer_drop=encoder_layer_drop,
+ aux_num_out=aux_num_out,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__init__.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a536ee2c28b470db9cc6b4f6d1dbfa664b3e17df
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__init__.py
@@ -0,0 +1,7 @@
+from .import_fairseq import import_fairseq_model
+from .import_huggingface import import_huggingface_model
+
+__all__ = [
+ "import_huggingface_model",
+ "import_fairseq_model",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d15057842e133442b8fa50d76e0eea64a10eb41
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/import_fairseq.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/import_fairseq.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2fb4b919eddb7f482a964a2d7ba8d3be2127f7a0
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/import_fairseq.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/import_huggingface.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/import_huggingface.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4b31a3ae6ba37b16d53d8f9297bcb2b2e2b4eef
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/__pycache__/import_huggingface.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/import_fairseq.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/import_fairseq.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5873446f1553cc6b7bf17a8e421ad1160772b57
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/import_fairseq.py
@@ -0,0 +1,213 @@
+"""Import fariseq's wav2vec2.0 pretrained weights to torchaudios's format.
+
+For this module to work, you need `fairseq`.
+"""
+import re
+
+from torch.nn import Module
+
+from ..model import wav2vec2_model, Wav2Vec2Model
+
+
+def _parse_config(w2v_model):
+ encoder = w2v_model.encoder
+ conv_layers = w2v_model.feature_extractor.conv_layers
+
+ extractor_mode = "layer_norm"
+ if "GroupNorm" in conv_layers[0][2].__class__.__name__:
+ extractor_mode = "group_norm"
+ else:
+ extractor_mode = "layer_norm"
+
+ conv_layer_config = [(l[0].out_channels, l[0].kernel_size[0], l[0].stride[0]) for l in conv_layers]
+
+ if all(l[0].bias is None for l in conv_layers):
+ conv_bias = False
+ elif all(l[0].bias is not None for l in conv_layers):
+ conv_bias = True
+ else:
+ raise ValueError("Either all the convolutions layers have bias term or none of them should.")
+
+ config = {
+ "extractor_mode": extractor_mode,
+ "extractor_conv_layer_config": conv_layer_config,
+ "extractor_conv_bias": conv_bias,
+ "encoder_embed_dim": w2v_model.post_extract_proj.out_features,
+ "encoder_projection_dropout": w2v_model.dropout_input.p,
+ "encoder_pos_conv_kernel": encoder.pos_conv[0].kernel_size[0],
+ "encoder_pos_conv_groups": encoder.pos_conv[0].groups,
+ "encoder_num_layers": len(encoder.layers),
+ "encoder_num_heads": encoder.layers[0].self_attn.num_heads,
+ "encoder_attention_dropout": encoder.layers[0].self_attn.dropout_module.p,
+ "encoder_ff_interm_features": encoder.layers[0].fc1.out_features,
+ "encoder_ff_interm_dropout": encoder.layers[0].dropout2.p,
+ "encoder_dropout": encoder.layers[0].dropout3.p,
+ "encoder_layer_norm_first": encoder.layer_norm_first,
+ "encoder_layer_drop": encoder.layerdrop,
+ }
+ return config
+
+
+def _map_key(key):
+ key_ = key
+ if key.startswith("w2v_model."):
+ key = key.replace("w2v_model.", "")
+ if re.match(r"(mask_emb|quantizer|project_q|final_proj|mask_emb)", key):
+ return None
+ # Feature Extractor
+ # Group norm when "extractor_mode" is "default".
+ # (Only the first layer)
+ # "conv_layers.0.2.weight" -> "conv_layers.0.layer_norm.weight"
+ # "conv_layers.0.2.bias" -> "conv_layers.0.layer_norm.bias"
+ match = re.match(r"feature_extractor\.conv_layers\.0\.2\.(weight|bias)", key)
+ if match:
+ return f"feature_extractor.conv_layers.0.layer_norm.{match.group(1)}"
+ # Convolutions
+ # "conv_layers.X.0.weight" -> "conv_layers.X.conv.weight"
+ # "conv_layers.X.0.bias" -> "conv_layers.X.conv.bias"
+ match = re.match(r"feature_extractor\.conv_layers\.(\d+)\.0\.(weight|bias)", key)
+ if match:
+ return f"feature_extractor.conv_layers.{match.group(1)}.conv.{match.group(2)}"
+ # Layer norm when "extractor_mode" is "layer_norm".
+ # "conv_layers.X.2.1.weight" -> "conv_layers.X.layer_norm.weight"
+ # "conv_layers.X.2.1.bias" -> "conv_layers.X.layer_norm.bias"
+ match = re.match(r"feature_extractor\.conv_layers\.(\d+)\.2\.1\.(weight|bias)", key)
+ if match:
+ return f"feature_extractor.conv_layers.{match.group(1)}.layer_norm.{match.group(2)}"
+ match = re.match(r"post_extract_proj\.(weight|bias)", key)
+ # Encoder - Feature projection
+ if match:
+ return f"encoder.feature_projection.projection.{match.group(1)}"
+ match = re.match(r"layer_norm\.(weight|bias)", key)
+ if match:
+ return f"encoder.feature_projection.layer_norm.{match.group(1)}"
+ # Encoder - Transformer - Convolutional positional embedding
+ match = re.match(r"encoder\.pos_conv\.0\.(bias|weight_g|weight_v)", key)
+ if match:
+ return f"encoder.transformer.pos_conv_embed.conv.{match.group(1)}"
+ match = re.match(r"encoder\.layer_norm\.(weight|bias)", key)
+ if match:
+ return f"encoder.transformer.layer_norm.{match.group(1)}"
+ # Encoder - Transformer - Self attention layers
+ match = re.match(r"encoder\.layers\.(\d+)\.self_attn\.((k_|v_|q_|out_)proj\.(weight|bias))", key)
+ if match:
+ return f"encoder.transformer.layers.{match.group(1)}.attention.{match.group(2)}"
+ match = re.match(r"encoder\.layers\.(\d+)\.self_attn_layer_norm\.(weight|bias)", key)
+ if match:
+ return f"encoder.transformer.layers.{match.group(1)}.layer_norm.{match.group(2)}"
+ match = re.match(r"encoder\.layers\.(\d+)\.fc1\.(weight|bias)", key)
+ if match:
+ return f"encoder.transformer.layers.{match.group(1)}.feed_forward.intermediate_dense.{match.group(2)}"
+ match = re.match(r"encoder\.layers\.(\d+)\.fc2\.(weight|bias)", key)
+ if match:
+ return f"encoder.transformer.layers.{match.group(1)}.feed_forward.output_dense.{match.group(2)}"
+ match = re.match(r"encoder\.layers\.(\d+)\.final_layer_norm\.(weight|bias)", key)
+ if match:
+ return f"encoder.transformer.layers.{match.group(1)}.final_layer_norm.{match.group(2)}"
+ match = re.match(r"proj\.(weight|bias)", key)
+ # Auxiliary Module
+ # Only relevant when loading fine-tuned models
+ if match:
+ return f"aux.{match.group(1)}"
+ # HuBERT Extension
+ if key in ["label_embs_concat"]:
+ return key
+ raise ValueError(f"Unexpected key: {key_}")
+
+
+def _convert_state_dict(state_dict):
+ converted = {}
+ for k, v in state_dict.items():
+ k = _map_key(k)
+ if k is not None:
+ converted[k] = v
+ return converted
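+
+
+# Illustrative sketch (hypothetical helper, not part of the original source): how
+# ``_map_key`` renames a few representative fairseq parameter names into the torchaudio
+# layout. The keys below are example names chosen to match the regular expressions above;
+# quantizer-related keys are dropped (mapped to ``None``).
+def _example_key_mapping():
+    examples = [
+        "feature_extractor.conv_layers.1.0.weight",  # -> feature_extractor.conv_layers.1.conv.weight
+        "encoder.layers.3.fc1.bias",  # -> encoder.transformer.layers.3.feed_forward.intermediate_dense.bias
+        "quantizer.vars",  # -> None (not imported)
+    ]
+    return {key: _map_key(key) for key in examples}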
+
+
+def import_fairseq_model(original: Module) -> Wav2Vec2Model:
+ """Builds :class:`Wav2Vec2Model` from the corresponding model object of
+ `fairseq `_.
+
+ Args:
+ original (torch.nn.Module):
+ An instance of fairseq's Wav2Vec2.0 or HuBERT model.
+            One of ``fairseq.models.wav2vec.wav2vec2_asr.Wav2VecEncoder``,
+            ``fairseq.models.wav2vec.wav2vec2.Wav2Vec2Model``,
+            ``fairseq.models.hubert.hubert.HubertModel`` or
+            ``fairseq.models.hubert.hubert_asr.HubertEncoder``.
+
+ Returns:
+ Wav2Vec2Model: Imported model.
+
+ Example - Loading pretrain-only model
+ >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
+ >>>
+ >>> # Load model using fairseq
+ >>> model_file = 'wav2vec_small.pt'
+ >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
+ >>> original = model[0]
+ >>> imported = import_fairseq_model(original)
+ >>>
+ >>> # Perform feature extraction
+ >>> waveform, _ = torchaudio.load('audio.wav')
+ >>> features, _ = imported.extract_features(waveform)
+ >>>
+ >>> # Compare result with the original model from fairseq
+ >>> reference = original.feature_extractor(waveform).transpose(1, 2)
+ >>> torch.testing.assert_allclose(features, reference)
+
+ Example - Fine-tuned model
+ >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
+ >>>
+ >>> # Load model using fairseq
+ >>> model_file = 'wav2vec_small_960h.pt'
+ >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
+ >>> original = model[0]
+ >>> imported = import_fairseq_model(original.w2v_encoder)
+ >>>
+ >>> # Perform encoding
+ >>> waveform, _ = torchaudio.load('audio.wav')
+ >>> emission, _ = imported(waveform)
+ >>>
+ >>> # Compare result with the original model from fairseq
+ >>> mask = torch.zeros_like(waveform)
+ >>> reference = original(waveform, mask)['encoder_out'].transpose(0, 1)
+ >>> torch.testing.assert_allclose(emission, reference)
+ """
+ class_ = original.__class__.__name__
+ if class_ == "Wav2Vec2Model":
+ return _import_wav2vec2_pretraining(original)
+ if class_ == "Wav2VecEncoder":
+ return _import_wav2vec2_finetuning(original)
+ if class_ == "HubertModel":
+ return _import_hubert_pretraining(original)
+ if class_ == "HubertEncoder":
+ return _import_hubert_finetuning(original)
+ raise ValueError(f"Expected an instance of `Wav2Vec2Model` or `Wav2VecEncoder`. Found: {class_}")
+
+
+def _import_wav2vec2_finetuning(original: Module) -> Wav2Vec2Model:
+ config = _parse_config(original.w2v_model)
+ model = wav2vec2_model(**config, aux_num_out=original.proj.out_features)
+ model.load_state_dict(_convert_state_dict(original.state_dict()))
+ return model
+
+
+def _import_wav2vec2_pretraining(original: Module) -> Wav2Vec2Model:
+ config = _parse_config(original)
+ model = wav2vec2_model(**config, aux_num_out=None)
+ model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
+ return model
+
+
+def _import_hubert_finetuning(original: Module) -> Wav2Vec2Model:
+ config = _parse_config(original.w2v_model)
+ model = wav2vec2_model(**config, aux_num_out=original.proj.out_features)
+ model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
+ return model
+
+
+def _import_hubert_pretraining(original: Module) -> Wav2Vec2Model:
+ config = _parse_config(original)
+ model = wav2vec2_model(**config, aux_num_out=None)
+ model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
+ return model
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/import_huggingface.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/import_huggingface.py
new file mode 100644
index 0000000000000000000000000000000000000000..38703408f01d52b8259f39921202ccbd19a24a3f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/utils/import_huggingface.py
@@ -0,0 +1,134 @@
+"""Import Hugging Face transformers's wav2vec2.0 pretrained weights to torchaudios's format.
+"""
+import logging
+from typing import Any, Dict
+
+import torch
+from torch.nn import Module
+
+from ..model import wav2vec2_model, Wav2Vec2Model, wavlm_model
+
+_LG = logging.getLogger(__name__)
+
+
+def _get_config(cfg):
+ config = {
+ "extractor_mode": f"{cfg.feat_extract_norm}_norm",
+ "extractor_conv_layer_config": list(zip(cfg.conv_dim, cfg.conv_kernel, cfg.conv_stride)),
+ "extractor_conv_bias": cfg.conv_bias,
+ "encoder_embed_dim": cfg.hidden_size,
+ "encoder_projection_dropout": cfg.feat_proj_dropout,
+ "encoder_pos_conv_kernel": cfg.num_conv_pos_embeddings,
+ "encoder_pos_conv_groups": cfg.num_conv_pos_embedding_groups,
+ "encoder_num_layers": cfg.num_hidden_layers,
+ "encoder_num_heads": cfg.num_attention_heads,
+ "encoder_attention_dropout": cfg.attention_dropout,
+ "encoder_ff_interm_features": cfg.intermediate_size,
+ "encoder_ff_interm_dropout": cfg.activation_dropout,
+ "encoder_dropout": cfg.hidden_dropout,
+ "encoder_layer_norm_first": cfg.do_stable_layer_norm,
+ "encoder_layer_drop": cfg.layerdrop,
+ }
+ return config
+
+
+def _get_config_wavlm(cfg):
+ config = {
+ "extractor_mode": f"{cfg.feat_extract_norm}_norm",
+ "extractor_conv_layer_config": list(zip(cfg.conv_dim, cfg.conv_kernel, cfg.conv_stride)),
+ "extractor_conv_bias": cfg.conv_bias,
+ "encoder_embed_dim": cfg.hidden_size,
+ "encoder_projection_dropout": cfg.feat_proj_dropout,
+ "encoder_pos_conv_kernel": cfg.num_conv_pos_embeddings,
+ "encoder_pos_conv_groups": cfg.num_conv_pos_embedding_groups,
+ "encoder_num_layers": cfg.num_hidden_layers,
+ "encoder_num_heads": cfg.num_attention_heads,
+ "encoder_num_buckets": cfg.num_buckets,
+ "encoder_max_distance": cfg.max_bucket_distance,
+ "encoder_attention_dropout": cfg.attention_dropout,
+ "encoder_ff_interm_features": cfg.intermediate_size,
+ "encoder_ff_interm_dropout": cfg.activation_dropout,
+ "encoder_dropout": cfg.hidden_dropout,
+ "encoder_layer_norm_first": cfg.do_stable_layer_norm,
+ "encoder_layer_drop": cfg.layerdrop,
+ }
+ return config
+
+
+def _build(config, original):
+ is_for_ctc = original.__class__.__name__ in ["Wav2Vec2ForCTC", "WavLMForCTC"]
+ if is_for_ctc:
+ aux_num_out = original.config.vocab_size
+ wav2vec2 = original.wav2vec2
+ else:
+ _LG.warning(
+ "The model is not an instance of Wav2Vec2ForCTC or WavLMForCTC. " '"lm_head" module is not imported.'
+ )
+ aux_num_out = None
+ wav2vec2 = original
+ is_wavlm = original.__class__.__name__ in ["WavLMModel", "WavLMForCTC"]
+ if is_wavlm:
+ imported = wavlm_model(**config, aux_num_out=aux_num_out)
+ else:
+ imported = wav2vec2_model(**config, aux_num_out=aux_num_out)
+ imported.feature_extractor.load_state_dict(wav2vec2.feature_extractor.state_dict())
+ imported.encoder.feature_projection.load_state_dict(wav2vec2.feature_projection.state_dict())
+ encoder_state_dict = wav2vec2.encoder.state_dict()
+ if is_wavlm: # Rename parameters of the linear projections to match the torch.nn.MultiheadAttention layout
+ transform_wavlm_encoder_state(encoder_state_dict, config["encoder_num_layers"])
+ imported.encoder.transformer.load_state_dict(encoder_state_dict)
+ if is_for_ctc:
+ imported.aux.load_state_dict(original.lm_head.state_dict())
+ return imported
+
+
+def transform_wavlm_encoder_state(state: Dict[str, Any], encoder_num_layers: int):
+ """Converts WavLM encoder state from HuggingFace format. In particular, concatenates linear projection weights and
+ biases to align with the structure of ``torch.nn.MultiheadAttention``.
+ """
+ for i in range(encoder_num_layers):
+ q_proj_bias = state.pop(f"layers.{i}.attention.q_proj.bias")
+ k_proj_bias = state.pop(f"layers.{i}.attention.k_proj.bias")
+ v_proj_bias = state.pop(f"layers.{i}.attention.v_proj.bias")
+ q_proj_weight = state.pop(f"layers.{i}.attention.q_proj.weight")
+ k_proj_weight = state.pop(f"layers.{i}.attention.k_proj.weight")
+ v_proj_weight = state.pop(f"layers.{i}.attention.v_proj.weight")
+ state[f"layers.{i}.attention.attention.in_proj_bias"] = torch.cat((q_proj_bias, k_proj_bias, v_proj_bias))
+ state[f"layers.{i}.attention.attention.in_proj_weight"] = torch.cat(
+ (q_proj_weight, k_proj_weight, v_proj_weight)
+ )
+
+ state[f"layers.{i}.attention.attention.out_proj.weight"] = state.pop(f"layers.{i}.attention.out_proj.weight")
+ state[f"layers.{i}.attention.attention.out_proj.bias"] = state.pop(f"layers.{i}.attention.out_proj.bias")
+
+
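+# The sketch below is illustrative only and is not part of the conversion code: it
+# shows how the q/k/v concatenation performed above matches the single
+# ``in_proj_weight`` / ``in_proj_bias`` layout used by ``torch.nn.MultiheadAttention``.
+# The helper name and the dimensions are made up for demonstration.
+def _example_in_proj_layout(embed_dim: int = 4):
+    q_w, k_w, v_w = (torch.randn(embed_dim, embed_dim) for _ in range(3))
+    q_b, k_b, v_b = (torch.zeros(embed_dim) for _ in range(3))
+    in_proj_weight = torch.cat((q_w, k_w, v_w))  # (3 * embed_dim, embed_dim)
+    in_proj_bias = torch.cat((q_b, k_b, v_b))  # (3 * embed_dim,)
+    return in_proj_weight.shape, in_proj_bias.shape
+
+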
+def import_huggingface_model(original: Module) -> Wav2Vec2Model:
+ """Builds :class:`Wav2Vec2Model` from the corresponding model object of
+ `Transformers `_.
+
+ Args:
+ original (torch.nn.Module): An instance of ``Wav2Vec2ForCTC`` from ``transformers``.
+
+ Returns:
+ Wav2Vec2Model: Imported model.
+
+ Example
+ >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
+ >>>
+ >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+ >>> model = import_huggingface_model(original)
+ >>>
+ >>> waveforms, _ = torchaudio.load("audio.wav")
+ >>> logits, _ = model(waveforms)
+ """
+ _LG.info("Importing model.")
+ _LG.info("Loading model configuration.")
+ is_wavlm = original.__class__.__name__ in ["WavLMModel", "WavLMForCTC"]
+ if is_wavlm:
+ config = _get_config_wavlm(original.config)
+ else:
+ config = _get_config(original.config)
+ _LG.debug(" - config: %s", config)
+ _LG.info("Building model.")
+ imported = _build(config, original)
+ return imported
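+
+
+# Illustrative sketch, not part of the module: importing a WavLM checkpoint goes
+# through the same entry point; the WavLM-specific config and state-dict handling
+# above is selected from the class name. The checkpoint id below is an assumption
+# used only for demonstration.
+def _example_import_wavlm():
+    from transformers import WavLMModel  # requires the ``transformers`` package
+
+    original = WavLMModel.from_pretrained("microsoft/wavlm-base")
+    return import_huggingface_model(original)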
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/wavlm_attention.py b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/wavlm_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fcff2a5679511c48675b894bc3f3efd501b6d0a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wav2vec2/wavlm_attention.py
@@ -0,0 +1,214 @@
+"""
+The MIT License (MIT)
+
+Copyright (c) Microsoft Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import math
+from typing import Optional, Tuple
+
+import torch
+from torch import nn, Tensor
+
+
+class WavLMSelfAttention(nn.Module):
+ """Multi-headed self-attention for WavLM model :cite:`chen2022wavlm`.
+ Wraps around ``torch.nn.MultiheadAttention``, creating relative position embeddings and passing them to multi-headed
+ attention as a mask.
+ Source: https://github.com/microsoft/unilm/blob/2d8302f09c99bca2b82e6e868d81d4281cceebc8/wavlm/modules.py#L303-L763
+
+ Args:
+ embed_dim (int): Total dimension of the model.
+ num_heads (int): The number of heads.
+ dropout (float, optional): Dropout probability on attn_output_weights. (Default: to ``0.0``)
+ bias (bool, optional): If ``True``, add bias to input / output projection layers. (Default: ``True``)
+ has_relative_attention_bias (bool, optional): If ``True``, apply relative position embedding.
+ Necessary in the first encoder layer, but not in the subsequent ones. (Default: ``False``)
+ num_buckets (int, optional): Number of buckets for relative position embedding. (Default: ``32``)
+ max_distance (int, optional): Maximum distance for relative position embedding. (Default: ``128``)
+ gru_rel_pos (bool, optional): If ``True``, apply gated relative position embedding. (Default: ``False``)
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ bias: bool = True,
+ has_relative_attention_bias: bool = False,
+ num_buckets: int = 32,
+ max_distance: int = 128,
+ gru_rel_pos: bool = True,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.has_relative_attention_bias = has_relative_attention_bias
+ self.num_buckets = num_buckets
+ self.max_distance = max_distance
+
+ if has_relative_attention_bias:
+ self.rel_attn_embed = nn.Embedding(num_buckets, num_heads)
+ else:
+ self.rel_attn_embed = None
+
+ self.head_dim = embed_dim // num_heads
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+ self.dropout = dropout
+ self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True)
+
+ self.gru_rel_pos = gru_rel_pos
+ if self.gru_rel_pos:
+ self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8)
+ self.gru_rel_pos_const = nn.Parameter(torch.ones(1, num_heads, 1, 1))
+ self.has_position_bias = True
+
+ def compute_bias(self, query_length: int, key_length: int) -> Tensor:
+ """Compute relative position embeddings for WavLM model.
+ Args:
+ query_length (int): Query position can take values between 0 and ``query_length - 1``.
+ key_length (int): Key position can take values between 0 and ``key_length - 1``.
+ Returns:
+ Tensor of shape `(num_heads, query_length, key_length)`, the relative position embeddings
+ """
+ context_position = torch.arange(query_length, dtype=torch.long)[:, None]
+ memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
+ relative_position = memory_position - context_position # Shape (query_length, key_length)
+ relative_position_bucket = self._relative_positions_bucket(relative_position, bidirectional=True)
+ relative_position_bucket = relative_position_bucket.to(self.rel_attn_embed.weight.device)
+ values = self.rel_attn_embed(relative_position_bucket) # Shape (query_length, key_length, num_heads)
+ values = values.permute([2, 0, 1])
+ return values
+
+ def _relative_positions_bucket(self, relative_positions: Tensor, bidirectional: bool = True):
+ """Compute relative position buckets for WavLM model. Computation similar to formula (5) in WavLM
+ paper :cite:`chen2022wavlm`.
+ Args:
+ relative_positions (Tensor): Relative offsets between query and key positions,
+ of shape ``(query_length, key_length)``.
+ bidirectional (bool): If ``True``, values will be filled both above and below the diagonal in the resulting
+ matrix. If ``False``, the elements above the diagonal (i.e. with negative relative offsets) will be set
+ to zero. (Default ``True``)
+ Returns:
+ Tensor of shape ``(query_length, key_length)`` filled with bucketed values of the relative positions.
+ """
+ num_buckets = self.num_buckets
+ max_distance = self.max_distance
+ # Shape (query_length, key_length)
+ relative_buckets = torch.zeros_like(relative_positions, dtype=torch.long)
+
+ if bidirectional:
+ num_buckets = num_buckets // 2
+ relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
+ relative_positions = torch.abs(relative_positions)
+ else:
+ relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions))
+
+ max_exact = num_buckets // 2
+ is_small = relative_positions < max_exact
+
+ relative_position_if_large = max_exact + (
+ torch.log(relative_positions.float() / max_exact)
+ / math.log(max_distance / max_exact)
+ * (num_buckets - max_exact)
+ ).to(torch.long)
+ relative_position_if_large = torch.min(
+ relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
+ )
+
+ relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
+ return relative_buckets
+
+ def forward(
+ self,
+ query: Tensor,
+ key_padding_mask: Optional[Tensor] = None,
+ attention_mask: Optional[Tensor] = None,
+ position_bias: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ query (Tensor): Input of shape ``(batch_size, src_len, embed_dim)``.
+ key_padding_mask (Tensor or None, optional): Mask to exclude keys that are pads, of shape
+ `(batch, src_len)`, where padding elements are indicated by 1s. (Default: ``None``)
+ attention_mask (Tensor or None, optional): Needs to be ``None``. The argument exists for compatibility with
+ ``EncoderLayer``. (Default: ``None``)
+ position_bias (Tensor or None, optional): Position bias of shape
+ ``(batch_size, num_heads, src_len, src_len)``. When used inside WavLM model encoder, will be
+ generated in the first layer and then passed from each encoder layer to the next one.
+ (Default: ``None``)
+ Returns:
+ attn_output (Tensor): Attention output of shape ``(batch_size, src_len, embed_dim)``.
+ position_bias (Tensor or None): Position bias of shape ``(batch_size, num_heads, src_len, src_len)``.
+ """
+ bsz, seq_len, embed_dim = query.size()
+ assert embed_dim == self.embed_dim
+ assert attention_mask is None
+
+ if self.rel_attn_embed is not None and position_bias is None:
+ position_bias = self.compute_bias(seq_len, seq_len)
+ position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1)
+
+ attn_mask_rel_pos: Optional[Tensor] = None
+ if position_bias is not None:
+ attn_mask_rel_pos = position_bias
+ if self.gru_rel_pos: # Apply gating on relative position bias
+ query_layer = query.view(bsz, seq_len, self.num_heads, -1)
+ query_layer = query_layer.permute(0, 2, 1, 3)
+
+ gate_a, gate_b = torch.sigmoid(
+ self.gru_rel_pos_linear(query_layer).view(bsz, self.num_heads, seq_len, 2, 4).sum(-1, keepdim=False)
+ ).chunk(2, dim=-1)
+ gate_a_1 = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0
+ attn_mask_rel_pos = gate_a_1.view(bsz, self.num_heads, -1, 1) * position_bias
+
+ attn_mask_rel_pos = attn_mask_rel_pos.view((bsz, self.num_heads, seq_len, seq_len))
+
+ if attn_mask_rel_pos is not None and key_padding_mask is not None:
+ key_padding_mask = key_padding_mask.view(bsz, 1, 1, seq_len).expand(-1, self.num_heads, -1, -1)
+ key_padding_mask = torch.nn.functional._canonical_mask(
+ mask=key_padding_mask,
+ mask_name="key_padding_mask",
+ other_type=torch.nn.functional._none_or_dtype(attn_mask_rel_pos),
+ other_name="",
+ target_type=query.dtype,
+ )
+ if attn_mask_rel_pos is not None and key_padding_mask is not None:
+ attn_mask_rel_pos = attn_mask_rel_pos + key_padding_mask
+ query_projected = torch.nn.functional.linear(query, self.attention.in_proj_weight, self.attention.in_proj_bias)
+ query, key, value = query_projected.chunk(3, -1)
+ shape = (bsz, seq_len, self.num_heads, self.head_dim)
+ query = query.view(shape).transpose(2, 1) # (batch, num_heads, seq_len, head_dim)
+ key = key.view(shape).transpose(2, 1) # (batch, num_heads, seq_len, head_dim)
+ value = value.view(shape).transpose(2, 1) # (batch, num_heads, seq_len, head_dim)
+ dropout = self.dropout if self.training else 0.0
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query,
+ key,
+ value,
+ attn_mask=attn_mask_rel_pos,
+ dropout_p=dropout,
+ is_causal=False,
+ )
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, -1, self.num_heads * self.head_dim)
+ attn_output = self.attention.out_proj(attn_output)
+ return attn_output, position_bias
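+
+
+# Illustrative usage sketch, not part of the module: run one self-attention layer
+# on random features and reuse the returned position bias, as the WavLM encoder
+# does between layers. All shapes below are made up for demonstration.
+def _example_wavlm_self_attention():
+    layer = WavLMSelfAttention(embed_dim=768, num_heads=12, has_relative_attention_bias=True)
+    x = torch.randn(2, 50, 768)  # (batch, src_len, embed_dim)
+    out, bias = layer(x)  # the first layer computes the relative position bias
+    out, _ = layer(out, position_bias=bias)  # later layers can reuse it
+    # The bucketing maps signed offsets to a bounded set of indices, e.g. with the
+    # default 32 buckets, offsets [-2, -1, 0, 1, 2] map to buckets [2, 1, 0, 17, 18].
+    buckets = layer._relative_positions_bucket(torch.tensor([[-2, -1, 0, 1, 2]]))
+    return out.shape, bias.shape, buckets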
diff --git a/MLPY/Lib/site-packages/torchaudio/models/wavernn.py b/MLPY/Lib/site-packages/torchaudio/models/wavernn.py
new file mode 100644
index 0000000000000000000000000000000000000000..90bc2fca7240235e2a8e67ba454ba29a4a9e667b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/models/wavernn.py
@@ -0,0 +1,409 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+__all__ = [
+ "ResBlock",
+ "MelResNet",
+ "Stretch2d",
+ "UpsampleNetwork",
+ "WaveRNN",
+]
+
+
+class ResBlock(nn.Module):
+ r"""ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.
+
+ Args:
+ n_freq: the number of bins in a spectrogram. (Default: ``128``)
+
+ Examples
+ >>> resblock = ResBlock()
+ >>> input = torch.rand(10, 128, 512) # a random spectrogram
+ >>> output = resblock(input) # shape: (10, 128, 512)
+ """
+
+ def __init__(self, n_freq: int = 128) -> None:
+ super().__init__()
+
+ self.resblock_model = nn.Sequential(
+ nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
+ nn.BatchNorm1d(n_freq),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
+ nn.BatchNorm1d(n_freq),
+ )
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""Pass the input through the ResBlock layer.
+ Args:
+ specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).
+
+ Return:
+ Tensor shape: (n_batch, n_freq, n_time)
+ """
+
+ return self.resblock_model(specgram) + specgram
+
+
+class MelResNet(nn.Module):
+ r"""MelResNet layer uses a stack of ResBlocks on spectrogram.
+
+ Args:
+ n_res_block: the number of ResBlock in stack. (Default: ``10``)
+ n_freq: the number of bins in a spectrogram. (Default: ``128``)
+ n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
+ n_output: the number of output dimensions of melresnet. (Default: ``128``)
+ kernel_size: the kernel size of the first Conv1d layer. (Default: ``5``)
+
+ Examples
+ >>> melresnet = MelResNet()
+ >>> input = torch.rand(10, 128, 512) # a random spectrogram
+ >>> output = melresnet(input) # shape: (10, 128, 508)
+ """
+
+ def __init__(
+ self, n_res_block: int = 10, n_freq: int = 128, n_hidden: int = 128, n_output: int = 128, kernel_size: int = 5
+ ) -> None:
+ super().__init__()
+
+ ResBlocks = [ResBlock(n_hidden) for _ in range(n_res_block)]
+
+ self.melresnet_model = nn.Sequential(
+ nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False),
+ nn.BatchNorm1d(n_hidden),
+ nn.ReLU(inplace=True),
+ *ResBlocks,
+ nn.Conv1d(in_channels=n_hidden, out_channels=n_output, kernel_size=1),
+ )
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""Pass the input through the MelResNet layer.
+ Args:
+ specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).
+
+ Return:
+ Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
+ """
+
+ return self.melresnet_model(specgram)
+
+
+class Stretch2d(nn.Module):
+ r"""Upscale the frequency and time dimensions of a spectrogram.
+
+ Args:
+ time_scale: the scale factor in time dimension
+ freq_scale: the scale factor in frequency dimension
+
+ Examples
+ >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)
+
+ >>> input = torch.rand(10, 100, 512) # a random spectrogram
+ >>> output = stretch2d(input) # shape: (10, 500, 5120)
+ """
+
+ def __init__(self, time_scale: int, freq_scale: int) -> None:
+ super().__init__()
+
+ self.freq_scale = freq_scale
+ self.time_scale = time_scale
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""Pass the input through the Stretch2d layer.
+
+ Args:
+ specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).
+
+ Return:
+ Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
+ """
+
+ return specgram.repeat_interleave(self.freq_scale, -2).repeat_interleave(self.time_scale, -1)
+
+
+class UpsampleNetwork(nn.Module):
+ r"""Upscale the dimensions of a spectrogram.
+
+ Args:
+ upsample_scales: the list of upsample scales.
+ n_res_block: the number of ResBlock in stack. (Default: ``10``)
+ n_freq: the number of bins in a spectrogram. (Default: ``128``)
+ n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
+ n_output: the number of output dimensions of melresnet. (Default: ``128``)
+ kernel_size: the kernel size of the first Conv1d layer. (Default: ``5``)
+
+ Examples
+ >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
+ >>> input = torch.rand(10, 128, 10) # a random spectrogram
+ >>> output = upsamplenetwork(input) # shape: (10, 128, 1536), (10, 128, 1536)
+ """
+
+ def __init__(
+ self,
+ upsample_scales: List[int],
+ n_res_block: int = 10,
+ n_freq: int = 128,
+ n_hidden: int = 128,
+ n_output: int = 128,
+ kernel_size: int = 5,
+ ) -> None:
+ super().__init__()
+
+ total_scale = 1
+ for upsample_scale in upsample_scales:
+ total_scale *= upsample_scale
+ self.total_scale: int = total_scale
+
+ self.indent = (kernel_size - 1) // 2 * total_scale
+ self.resnet = MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size)
+ self.resnet_stretch = Stretch2d(total_scale, 1)
+
+ up_layers = []
+ for scale in upsample_scales:
+ stretch = Stretch2d(scale, 1)
+ conv = nn.Conv2d(
+ in_channels=1, out_channels=1, kernel_size=(1, scale * 2 + 1), padding=(0, scale), bias=False
+ )
+ torch.nn.init.constant_(conv.weight, 1.0 / (scale * 2 + 1))
+ up_layers.append(stretch)
+ up_layers.append(conv)
+ self.upsample_layers = nn.Sequential(*up_layers)
+
+ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:
+ r"""Pass the input through the UpsampleNetwork layer.
+
+ Args:
+ specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)
+
+ Return:
+ Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
+ (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
+ where total_scale is the product of all elements in upsample_scales.
+ """
+
+ resnet_output = self.resnet(specgram).unsqueeze(1)
+ resnet_output = self.resnet_stretch(resnet_output)
+ resnet_output = resnet_output.squeeze(1)
+
+ specgram = specgram.unsqueeze(1)
+ upsampling_output = self.upsample_layers(specgram)
+ upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent : -self.indent]
+
+ return upsampling_output, resnet_output
+
+
+class WaveRNN(nn.Module):
+ r"""WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
+ based on the implementation from `fatchord/WaveRNN `_.
+
+ The original implementation was introduced in *Efficient Neural Audio Synthesis*
+ :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
+ The product of `upsample_scales` must equal `hop_length`.
+
+ See Also:
+ * `Training example `__
+ * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.
+
+ Args:
+ upsample_scales: the list of upsample scales.
+ n_classes: the number of output classes.
+ hop_length: the number of samples between the starts of consecutive frames.
+ n_res_block: the number of ResBlock in stack. (Default: ``10``)
+ n_rnn: the dimension of RNN layer. (Default: ``512``)
+ n_fc: the dimension of fully connected layer. (Default: ``512``)
+ kernel_size: the kernel size of the first Conv1d layer. (Default: ``5``)
+ n_freq: the number of bins in a spectrogram. (Default: ``128``)
+ n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
+ n_output: the number of output dimensions of melresnet. (Default: ``128``)
+
+ Example
+ >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
+ >>> waveform, sample_rate = torchaudio.load(file)
+ >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
+ >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
+ >>> output = wavernn(waveform, specgram)
+ >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
+ """
+
+ def __init__(
+ self,
+ upsample_scales: List[int],
+ n_classes: int,
+ hop_length: int,
+ n_res_block: int = 10,
+ n_rnn: int = 512,
+ n_fc: int = 512,
+ kernel_size: int = 5,
+ n_freq: int = 128,
+ n_hidden: int = 128,
+ n_output: int = 128,
+ ) -> None:
+ super().__init__()
+
+ self.kernel_size = kernel_size
+ self._pad = (kernel_size - 1 if kernel_size % 2 else kernel_size) // 2
+ self.n_rnn = n_rnn
+ self.n_aux = n_output // 4
+ self.hop_length = hop_length
+ self.n_classes = n_classes
+ self.n_bits: int = int(math.log2(self.n_classes))
+
+ total_scale = 1
+ for upsample_scale in upsample_scales:
+ total_scale *= upsample_scale
+ if total_scale != self.hop_length:
+ raise ValueError(f"Expected: total_scale == hop_length, but found {total_scale} != {hop_length}")
+
+ self.upsample = UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
+ self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn)
+
+ self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True)
+ self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True)
+
+ self.relu1 = nn.ReLU(inplace=True)
+ self.relu2 = nn.ReLU(inplace=True)
+
+ self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc)
+ self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc)
+ self.fc3 = nn.Linear(n_fc, self.n_classes)
+
+ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
+ r"""Pass the input through the WaveRNN model.
+
+ Args:
+ waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
+ specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)
+
+ Return:
+ Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
+ """
+
+ if waveform.size(1) != 1:
+ raise ValueError("Require the input channel of waveform is 1")
+ if specgram.size(1) != 1:
+ raise ValueError("Require the input channel of specgram is 1")
+ # remove channel dimension until the end
+ waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)
+
+ batch_size = waveform.size(0)
+ h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
+ h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
+ # output of upsample:
+ # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale)
+ # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
+ specgram, aux = self.upsample(specgram)
+ specgram = specgram.transpose(1, 2)
+ aux = aux.transpose(1, 2)
+
+ aux_idx = [self.n_aux * i for i in range(5)]
+ a1 = aux[:, :, aux_idx[0] : aux_idx[1]]
+ a2 = aux[:, :, aux_idx[1] : aux_idx[2]]
+ a3 = aux[:, :, aux_idx[2] : aux_idx[3]]
+ a4 = aux[:, :, aux_idx[3] : aux_idx[4]]
+
+ x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1)
+ x = self.fc(x)
+ res = x
+ x, _ = self.rnn1(x, h1)
+
+ x = x + res
+ res = x
+ x = torch.cat([x, a2], dim=-1)
+ x, _ = self.rnn2(x, h2)
+
+ x = x + res
+ x = torch.cat([x, a3], dim=-1)
+ x = self.fc1(x)
+ x = self.relu1(x)
+
+ x = torch.cat([x, a4], dim=-1)
+ x = self.fc2(x)
+ x = self.relu2(x)
+ x = self.fc3(x)
+
+ # bring back channel dimension
+ return x.unsqueeze(1)
+
+ @torch.jit.export
+ def infer(self, specgram: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
+ r"""Inference method of WaveRNN.
+
+ This function currently only supports multinomial sampling, which assumes the
+ network is trained on cross entropy loss.
+
+ Args:
+ specgram (Tensor):
+ Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
+ lengths (Tensor or None, optional):
+ Indicates the valid length of each audio in the batch.
+ Shape: `(batch, )`.
+ When the ``specgram`` contains spectrograms with different durations,
+ by providing ``lengths`` argument, the model will compute
+ the corresponding valid output lengths.
+ If ``None``, it is assumed that all the audio in ``specgram``
+ has valid length. Default: ``None``.
+
+ Returns:
+ (Tensor, Optional[Tensor]):
+ Tensor
+ The inferred waveform of size `(n_batch, 1, n_time)`.
+ 1 stands for a single channel.
+ Tensor or None
+ If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
+ is returned.
+ It indicates the valid length in time axis of the output Tensor.
+ """
+
+ device = specgram.device
+ dtype = specgram.dtype
+
+ specgram = torch.nn.functional.pad(specgram, (self._pad, self._pad))
+ specgram, aux = self.upsample(specgram)
+ if lengths is not None:
+ lengths = lengths * self.upsample.total_scale
+
+ output: List[Tensor] = []
+ b_size, _, seq_len = specgram.size()
+
+ h1 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype)
+ h2 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype)
+ x = torch.zeros((b_size, 1), device=device, dtype=dtype)
+
+ aux_split = [aux[:, self.n_aux * i : self.n_aux * (i + 1), :] for i in range(4)]
+
+ for i in range(seq_len):
+
+ m_t = specgram[:, :, i]
+
+ a1_t, a2_t, a3_t, a4_t = [a[:, :, i] for a in aux_split]
+
+ x = torch.cat([x, m_t, a1_t], dim=1)
+ x = self.fc(x)
+ _, h1 = self.rnn1(x.unsqueeze(1), h1)
+
+ x = x + h1[0]
+ inp = torch.cat([x, a2_t], dim=1)
+ _, h2 = self.rnn2(inp.unsqueeze(1), h2)
+
+ x = x + h2[0]
+ x = torch.cat([x, a3_t], dim=1)
+ x = F.relu(self.fc1(x))
+
+ x = torch.cat([x, a4_t], dim=1)
+ x = F.relu(self.fc2(x))
+
+ logits = self.fc3(x)
+
+ posterior = F.softmax(logits, dim=1)
+
+ x = torch.multinomial(posterior, 1).float()
+ # Transform label [0, 2 ** n_bits - 1] to waveform [-1, 1]
+ x = 2 * x / (2**self.n_bits - 1.0) - 1.0
+
+ output.append(x)
+
+ return torch.stack(output).permute(1, 2, 0), lengths
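+
+
+# Illustrative inference sketch, not part of the module: generate a waveform from
+# a random mel-like spectrogram. The hyperparameters below are made up; they only
+# need to satisfy the documented constraint that the product of ``upsample_scales``
+# equals ``hop_length``.
+def _example_wavernn_infer():
+    model = WaveRNN(upsample_scales=[5, 5, 8], n_classes=512, hop_length=200)
+    model.eval()
+    specgram = torch.rand(1, 128, 10)  # (n_batch, n_freq, n_time)
+    with torch.no_grad():
+        waveform, _ = model.infer(specgram)  # autoregressive, one sample per step
+    return waveform.shape  # torch.Size([1, 1, 2000]) == (n_batch, 1, n_time * hop_length)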
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/__init__.py b/MLPY/Lib/site-packages/torchaudio/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cad3f14dfb5b27839d0954959428732acedb8a2a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/__init__.py
@@ -0,0 +1,102 @@
+from ._source_separation_pipeline import (
+ CONVTASNET_BASE_LIBRI2MIX,
+ HDEMUCS_HIGH_MUSDB,
+ HDEMUCS_HIGH_MUSDB_PLUS,
+ SourceSeparationBundle,
+)
+from ._squim_pipeline import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE, SquimObjectiveBundle, SquimSubjectiveBundle
+from ._tts import (
+ TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH,
+ TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH,
+ TACOTRON2_WAVERNN_CHAR_LJSPEECH,
+ TACOTRON2_WAVERNN_PHONE_LJSPEECH,
+ Tacotron2TTSBundle,
+)
+from ._wav2vec2.impl import (
+ HUBERT_ASR_LARGE,
+ HUBERT_ASR_XLARGE,
+ HUBERT_BASE,
+ HUBERT_LARGE,
+ HUBERT_XLARGE,
+ MMS_FA,
+ VOXPOPULI_ASR_BASE_10K_DE,
+ VOXPOPULI_ASR_BASE_10K_EN,
+ VOXPOPULI_ASR_BASE_10K_ES,
+ VOXPOPULI_ASR_BASE_10K_FR,
+ VOXPOPULI_ASR_BASE_10K_IT,
+ WAV2VEC2_ASR_BASE_100H,
+ WAV2VEC2_ASR_BASE_10M,
+ WAV2VEC2_ASR_BASE_960H,
+ WAV2VEC2_ASR_LARGE_100H,
+ WAV2VEC2_ASR_LARGE_10M,
+ WAV2VEC2_ASR_LARGE_960H,
+ WAV2VEC2_ASR_LARGE_LV60K_100H,
+ WAV2VEC2_ASR_LARGE_LV60K_10M,
+ WAV2VEC2_ASR_LARGE_LV60K_960H,
+ WAV2VEC2_BASE,
+ WAV2VEC2_LARGE,
+ WAV2VEC2_LARGE_LV60K,
+ WAV2VEC2_XLSR53,
+ WAV2VEC2_XLSR_1B,
+ WAV2VEC2_XLSR_2B,
+ WAV2VEC2_XLSR_300M,
+ Wav2Vec2ASRBundle,
+ Wav2Vec2Bundle,
+ Wav2Vec2FABundle,
+ WAVLM_BASE,
+ WAVLM_BASE_PLUS,
+ WAVLM_LARGE,
+)
+from .rnnt_pipeline import EMFORMER_RNNT_BASE_LIBRISPEECH, RNNTBundle
+
+
+__all__ = [
+ "Wav2Vec2Bundle",
+ "Wav2Vec2ASRBundle",
+ "Wav2Vec2FABundle",
+ "WAV2VEC2_BASE",
+ "WAV2VEC2_LARGE",
+ "WAV2VEC2_LARGE_LV60K",
+ "WAV2VEC2_ASR_BASE_10M",
+ "WAV2VEC2_ASR_BASE_100H",
+ "WAV2VEC2_ASR_BASE_960H",
+ "WAV2VEC2_ASR_LARGE_10M",
+ "WAV2VEC2_ASR_LARGE_100H",
+ "WAV2VEC2_ASR_LARGE_960H",
+ "WAV2VEC2_ASR_LARGE_LV60K_10M",
+ "WAV2VEC2_ASR_LARGE_LV60K_100H",
+ "WAV2VEC2_ASR_LARGE_LV60K_960H",
+ "WAV2VEC2_XLSR53",
+ "WAV2VEC2_XLSR_300M",
+ "WAV2VEC2_XLSR_1B",
+ "WAV2VEC2_XLSR_2B",
+ "VOXPOPULI_ASR_BASE_10K_EN",
+ "VOXPOPULI_ASR_BASE_10K_ES",
+ "VOXPOPULI_ASR_BASE_10K_DE",
+ "VOXPOPULI_ASR_BASE_10K_FR",
+ "VOXPOPULI_ASR_BASE_10K_IT",
+ "HUBERT_BASE",
+ "HUBERT_LARGE",
+ "HUBERT_XLARGE",
+ "HUBERT_ASR_LARGE",
+ "HUBERT_ASR_XLARGE",
+ "MMS_FA",
+ "WAVLM_BASE",
+ "WAVLM_BASE_PLUS",
+ "WAVLM_LARGE",
+ "Tacotron2TTSBundle",
+ "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH",
+ "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH",
+ "TACOTRON2_WAVERNN_CHAR_LJSPEECH",
+ "TACOTRON2_WAVERNN_PHONE_LJSPEECH",
+ "RNNTBundle",
+ "EMFORMER_RNNT_BASE_LIBRISPEECH",
+ "SourceSeparationBundle",
+ "CONVTASNET_BASE_LIBRI2MIX",
+ "HDEMUCS_HIGH_MUSDB_PLUS",
+ "HDEMUCS_HIGH_MUSDB",
+ "SQUIM_OBJECTIVE",
+ "SQUIM_SUBJECTIVE",
+ "SquimObjectiveBundle",
+ "SquimSubjectiveBundle",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6426d3923bda9fd1f427e3aba4cc112437804252
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..717d4f7fd5e84f2f3b69dae81aeafc675c9ce855
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3315c06d4f3a73de18a707f14f5941caa4fa521
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48ebcda54b4b997cca261400ffbc269ec9050132
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_source_separation_pipeline.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_source_separation_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..eebf190bd2233bd65143ba4b8b0da0ba5f1c6eba
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_source_separation_pipeline.py
@@ -0,0 +1,109 @@
+from dataclasses import dataclass
+from functools import partial
+from typing import Callable
+
+import torch
+import torchaudio
+
+from torchaudio.models import conv_tasnet_base, hdemucs_high
+
+
+@dataclass
+class SourceSeparationBundle:
+ """Dataclass that bundles components for performing source separation.
+
+ Example
+ >>> import torchaudio
+ >>> from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX
+ >>> import torch
+ >>>
+ >>> # Build the separation model.
+ >>> model = CONVTASNET_BASE_LIBRI2MIX.get_model()
+ >>> 100%|███████████████████████████████|19.1M/19.1M [00:04<00:00, 4.93MB/s]
+ >>>
+ >>> # Instantiate the test set of Libri2Mix dataset.
+ >>> dataset = torchaudio.datasets.LibriMix("/home/datasets/", subset="test")
+ >>>
+ >>> # Apply source separation on mixture audio.
+ >>> for i, data in enumerate(dataset):
+ >>> sample_rate, mixture, clean_sources = data
+ >>> # Make sure the shape of input suits the model requirement.
+ >>> mixture = mixture.reshape(1, 1, -1)
+ >>> estimated_sources = model(mixture)
+ >>> score = si_snr_pit(estimated_sources, clean_sources) # for demonstration
+ >>> print(f"Si-SNR score is : {score}.)
+ >>> break
+ >>> Si-SNR score is : 16.24.
+ >>>
+ """
+
+ _model_path: str
+ _model_factory_func: Callable[[], torch.nn.Module]
+ _sample_rate: int
+
+ @property
+ def sample_rate(self) -> int:
+ """Sample rate of the audio that the model is trained on.
+
+ :type: int
+ """
+ return self._sample_rate
+
+ def get_model(self) -> torch.nn.Module:
+ """Construct the model and load the pretrained weight."""
+ model = self._model_factory_func()
+ path = torchaudio.utils.download_asset(self._model_path)
+ state_dict = torch.load(path)
+ model.load_state_dict(state_dict)
+ model.eval()
+ return model
+
+
+CONVTASNET_BASE_LIBRI2MIX = SourceSeparationBundle(
+ _model_path="models/conv_tasnet_base_libri2mix.pt",
+ _model_factory_func=partial(conv_tasnet_base, num_sources=2),
+ _sample_rate=8000,
+)
+CONVTASNET_BASE_LIBRI2MIX.__doc__ = """Pre-trained Source Separation pipeline with *ConvTasNet*
+:cite:`Luo_2019` trained on *Libri2Mix dataset* :cite:`cosentino2020librimix`.
+
+The source separation model is constructed by :func:`~torchaudio.models.conv_tasnet_base`
+and is trained using the training script ``lightning_train.py``
+`here `__
+with default arguments.
+
+Please refer to :class:`SourceSeparationBundle` for usage instructions.
+"""
+
+
+HDEMUCS_HIGH_MUSDB_PLUS = SourceSeparationBundle(
+ _model_path="models/hdemucs_high_trained.pt",
+ _model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
+ _sample_rate=44100,
+)
+HDEMUCS_HIGH_MUSDB_PLUS.__doc__ = """Pre-trained music source separation pipeline with
+*Hybrid Demucs* :cite:`defossez2021hybrid` trained on both training and test sets of
+MUSDB-HQ :cite:`MUSDB18HQ` and 150 additional songs from an internal database
+that was produced specifically for Meta.
+
+The model is constructed by :func:`~torchaudio.models.hdemucs_high`.
+
+Training was performed in the original HDemucs repository `here `__.
+
+Please refer to :class:`SourceSeparationBundle` for usage instructions.
+"""
+
+
+HDEMUCS_HIGH_MUSDB = SourceSeparationBundle(
+ _model_path="models/hdemucs_high_musdbhq_only.pt",
+ _model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
+ _sample_rate=44100,
+)
+HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained music source separation pipeline with
+*Hybrid Demucs* :cite:`defossez2021hybrid` trained on the training set of MUSDB-HQ :cite:`MUSDB18HQ`.
+
+The model is constructed by :func:`~torchaudio.models.hdemucs_high`.
+Training was performed in the original HDemucs repository `here `__.
+
+Please refer to :class:`SourceSeparationBundle` for usage instructions.
+"""
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_squim_pipeline.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_squim_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..a80731cd3ba5488a785fd99f9cbe5025b63b046a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_squim_pipeline.py
@@ -0,0 +1,176 @@
+from dataclasses import dataclass
+
+from torchaudio._internal import load_state_dict_from_url
+
+from torchaudio.models import squim_objective_base, squim_subjective_base, SquimObjective, SquimSubjective
+
+
+@dataclass
+class SquimObjectiveBundle:
+ """Data class that bundles associated information to use pretrained
+ :py:class:`~torchaudio.models.SquimObjective` model.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+ This bundle can estimate objective metric scores for speech enhancement, such as STOI, PESQ, Si-SDR.
+ A typical use case would be a flow like `waveform -> list of scores`. Please see below for the code example.
+
+ Example: Estimate the objective metric scores for the input waveform.
+ >>> import torch
+ >>> import torchaudio
+ >>> from torchaudio.pipelines import SQUIM_OBJECTIVE as bundle
+ >>>
+ >>> # Load the SquimObjective bundle
+ >>> model = bundle.get_model()
+ Downloading: "https://download.pytorch.org/torchaudio/models/squim_objective_dns2020.pth"
+ 100%|████████████| 28.2M/28.2M [00:03<00:00, 9.24MB/s]
+ >>>
+ >>> # Resample audio to the expected sampling rate
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+ >>>
+ >>> # Estimate objective metric scores
+ >>> scores = model(waveform)
+ >>> print(f"STOI: {scores[0].item()}, PESQ: {scores[1].item()}, SI-SDR: {scores[2].item()}.")
+ """ # noqa: E501
+
+ _path: str
+ _sample_rate: float
+
+ def _get_state_dict(self, dl_kwargs):
+ url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ state_dict = load_state_dict_from_url(url, **dl_kwargs)
+ return state_dict
+
+ def get_model(self, *, dl_kwargs=None) -> SquimObjective:
+ """Construct the SquimObjective model, and load the pretrained weight.
+
+ The weight file is downloaded from the internet and cached with
+ :func:`torch.hub.load_state_dict_from_url`
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Variation of :py:class:`~torchaudio.models.SquimObjective`.
+ """
+ model = squim_objective_base()
+ model.load_state_dict(self._get_state_dict(dl_kwargs))
+ model.eval()
+ return model
+
+ @property
+ def sample_rate(self):
+ """Sample rate of the audio that the model is trained on.
+
+ :type: float
+ """
+ return self._sample_rate
+
+
+SQUIM_OBJECTIVE = SquimObjectiveBundle(
+ "squim_objective_dns2020.pth",
+ _sample_rate=16000,
+)
+SQUIM_OBJECTIVE.__doc__ = """SquimObjective pipeline trained using the approach described in
+ :cite:`kumar2023torchaudio` on the *DNS 2020 Dataset* :cite:`reddy2020interspeech`.
+
+ The underlying model is constructed by :py:func:`torchaudio.models.squim_objective_base`.
+ The weights are under `Creative Commons Attribution 4.0 International License
+ `__.
+
+ Please refer to :py:class:`SquimObjectiveBundle` for usage instructions.
+ """
+
+
+@dataclass
+class SquimSubjectiveBundle:
+ """Data class that bundles associated information to use pretrained
+ :py:class:`~torchaudio.models.SquimSubjective` model.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+ This bundle can estimate subjective metric scores for speech enhancement, such as MOS.
+ A typical use case would be a flow like `waveform -> score`. Please see below for the code example.
+
+ Example: Estimate the subjective metric scores for the input waveform.
+ >>> import torch
+ >>> import torchaudio
+ >>> from torchaudio.pipelines import SQUIM_SUBJECTIVE as bundle
+ >>>
+ >>> # Load the SquimSubjective bundle
+ >>> model = bundle.get_model()
+ Downloading: "https://download.pytorch.org/torchaudio/models/squim_subjective_bvcc_daps.pth"
+ 100%|████████████| 360M/360M [00:09<00:00, 41.1MB/s]
+ >>>
+ >>> # Resample audio to the expected sampling rate
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+ >>> # Use a clean reference (doesn't need to be the reference for the waveform) as the second input
+ >>> reference = torchaudio.functional.resample(reference, sample_rate, bundle.sample_rate)
+ >>>
+ >>> # Estimate subjective metric scores
+ >>> score = model(waveform, reference)
+ >>> print(f"MOS: {score}.")
+ """ # noqa: E501
+
+ _path: str
+ _sample_rate: float
+
+ def _get_state_dict(self, dl_kwargs):
+ url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ state_dict = load_state_dict_from_url(url, **dl_kwargs)
+ return state_dict
+
+ def get_model(self, *, dl_kwargs=None) -> SquimSubjective:
+ """Construct the SquimSubjective model, and load the pretrained weight.
+
+ The weight file is downloaded from the internet and cached with
+ :func:`torch.hub.load_state_dict_from_url`
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Variation of :py:class:`~torchaudio.models.SquimSubjective`.
+ """
+ model = squim_subjective_base()
+ model.load_state_dict(self._get_state_dict(dl_kwargs))
+ model.eval()
+ return model
+
+ @property
+ def sample_rate(self):
+ """Sample rate of the audio that the model is trained on.
+
+ :type: float
+ """
+ return self._sample_rate
+
+
+SQUIM_SUBJECTIVE = SquimSubjectiveBundle(
+ "squim_subjective_bvcc_daps.pth",
+ _sample_rate=16000,
+)
+SQUIM_SUBJECTIVE.__doc__ = """SquimSubjective pipeline trained
+ as described in :cite:`manocha2022speech` and :cite:`kumar2023torchaudio`
+ on the *BVCC* :cite:`cooper2021voices` and *DAPS* :cite:`mysore2014can` datasets.
+
+ The underlying model is constructed by :py:func:`torchaudio.models.squim_subjective_base`.
+ The weights are under `Creative Commons Attribution Non Commercial 4.0 International
+ `__.
+
+ Please refer to :py:class:`SquimSubjectiveBundle` for usage instructions.
+ """
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__init__.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..488d121d458f65454bab2719f873c10262e1aac9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__init__.py
@@ -0,0 +1,16 @@
+from .impl import (
+ TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH,
+ TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH,
+ TACOTRON2_WAVERNN_CHAR_LJSPEECH,
+ TACOTRON2_WAVERNN_PHONE_LJSPEECH,
+)
+from .interface import Tacotron2TTSBundle
+
+
+__all__ = [
+ "Tacotron2TTSBundle",
+ "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH",
+ "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH",
+ "TACOTRON2_WAVERNN_CHAR_LJSPEECH",
+ "TACOTRON2_WAVERNN_PHONE_LJSPEECH",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7265bc9ec526d96728d1e2e7bb1baf6a39ab010
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/impl.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/impl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42ac011d3e1832483af4c625d73f606ee7d21669
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/impl.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/interface.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/interface.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5e13047828bb790aba22f0ab9bbe15c03d40158
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/interface.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00d2bd3f605e705cbf84b0de6c306dca769688d5
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/impl.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b8ac89c4a940128e406a743d023b53835645c95
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/impl.py
@@ -0,0 +1,385 @@
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from torchaudio._internal import load_state_dict_from_url
+from torchaudio.functional import mu_law_decoding
+from torchaudio.models import Tacotron2, WaveRNN
+from torchaudio.transforms import GriffinLim, InverseMelScale
+
+from . import utils
+from .interface import Tacotron2TTSBundle
+
+__all__ = []
+
+_BASE_URL = "https://download.pytorch.org/torchaudio/models"
+
+
+################################################################################
+# Pipeline implementation - Text Processor
+################################################################################
+
+
+class _EnglishCharProcessor(Tacotron2TTSBundle.TextProcessor):
+ def __init__(self):
+ super().__init__()
+ self._tokens = utils._get_chars()
+ self._mapping = {s: i for i, s in enumerate(self._tokens)}
+
+ @property
+ def tokens(self):
+ return self._tokens
+
+ def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
+ if isinstance(texts, str):
+ texts = [texts]
+ indices = [[self._mapping[c] for c in t.lower() if c in self._mapping] for t in texts]
+ return utils._to_tensor(indices)
+
+
+class _EnglishPhoneProcessor(Tacotron2TTSBundle.TextProcessor):
+ def __init__(self, *, dl_kwargs=None):
+ super().__init__()
+ self._tokens = utils._get_phones()
+ self._mapping = {p: i for i, p in enumerate(self._tokens)}
+ self._phonemizer = utils._load_phonemizer("en_us_cmudict_forward.pt", dl_kwargs=dl_kwargs)
+ self._pattern = r"(\[[A-Z]+?\]|[_!'(),.:;? -])"
+
+ @property
+ def tokens(self):
+ return self._tokens
+
+ def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
+ if isinstance(texts, str):
+ texts = [texts]
+
+ indices = []
+ for phones in self._phonemizer(texts, lang="en_us"):
+ # '[F][UW][B][AA][R]!' -> ['F', 'UW', 'B', 'AA', 'R', '!']
+ ret = [re.sub(r"[\[\]]", "", r) for r in re.findall(self._pattern, phones)]
+ indices.append([self._mapping[p] for p in ret])
+ return utils._to_tensor(indices)
+
+
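+# Illustrative sketch, not part of the pipeline: what the character processor
+# produces for a small batch. The values are indices into ``utils._get_chars()``
+# and the exact numbers depend on that token table; the shapes are the point here.
+def _example_char_encoding():
+    processor = _EnglishCharProcessor()
+    tokens, lengths = processor(["Hello world!", "TTS"])
+    return tokens.shape, lengths  # expected: torch.Size([2, 12]), tensor([12, 3])
+
+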
+################################################################################
+# Pipeline implementation - Vocoder
+################################################################################
+
+
+class _WaveRNNVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
+ def __init__(self, model: WaveRNN, min_level_db: Optional[float] = -100):
+ super().__init__()
+ self._sample_rate = 22050
+ self._model = model
+ self._min_level_db = min_level_db
+
+ @property
+ def sample_rate(self):
+ return self._sample_rate
+
+ def forward(self, mel_spec, lengths=None):
+ mel_spec = torch.exp(mel_spec)
+ mel_spec = 20 * torch.log10(torch.clamp(mel_spec, min=1e-5))
+ if self._min_level_db is not None:
+ mel_spec = (self._min_level_db - mel_spec) / self._min_level_db
+ mel_spec = torch.clamp(mel_spec, min=0, max=1)
+ waveform, lengths = self._model.infer(mel_spec, lengths)
+ waveform = utils._unnormalize_waveform(waveform, self._model.n_bits)
+ waveform = mu_law_decoding(waveform, self._model.n_classes)
+ waveform = waveform.squeeze(1)
+ return waveform, lengths
+
+
+class _GriffinLimVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
+ def __init__(self):
+ super().__init__()
+ self._sample_rate = 22050
+ self._inv_mel = InverseMelScale(
+ n_stft=(1024 // 2 + 1),
+ n_mels=80,
+ sample_rate=self.sample_rate,
+ f_min=0.0,
+ f_max=8000.0,
+ mel_scale="slaney",
+ norm="slaney",
+ )
+ self._griffin_lim = GriffinLim(
+ n_fft=1024,
+ power=1,
+ hop_length=256,
+ win_length=1024,
+ )
+
+ @property
+ def sample_rate(self):
+ return self._sample_rate
+
+ def forward(self, mel_spec, lengths=None):
+ mel_spec = torch.exp(mel_spec)
+ mel_spec = mel_spec.clone().detach().requires_grad_(True)
+ spec = self._inv_mel(mel_spec)
+ spec = spec.detach().requires_grad_(False)
+ waveforms = self._griffin_lim(spec)
+ return waveforms, lengths
+
+
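+# Illustrative sketch, not part of the pipeline: both vocoders accept the
+# log-scale mel spectrogram produced by ``Tacotron2.infer`` and return
+# ``(waveform, lengths)``. A random tensor stands in for a real spectrogram here.
+def _example_griffinlim_vocoder():
+    vocoder = _GriffinLimVocoder()
+    mel_spec = torch.randn(1, 80, 100)  # (batch, n_mels, time), log scale
+    waveform, _ = vocoder(mel_spec, lengths=None)
+    return waveform.shape, vocoder.sample_rate  # sample_rate is 22050
+
+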
+################################################################################
+# Bundle classes mixins
+################################################################################
+
+
+class _CharMixin:
+ def get_text_processor(self) -> Tacotron2TTSBundle.TextProcessor:
+ return _EnglishCharProcessor()
+
+
+class _PhoneMixin:
+ def get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
+ return _EnglishPhoneProcessor(dl_kwargs=dl_kwargs)
+
+
+@dataclass
+class _Tacotron2Mixin:
+ _tacotron2_path: str
+ _tacotron2_params: Dict[str, Any]
+
+ def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
+ model = Tacotron2(**self._tacotron2_params)
+ url = f"{_BASE_URL}/{self._tacotron2_path}"
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ state_dict = load_state_dict_from_url(url, **dl_kwargs)
+ model.load_state_dict(state_dict)
+ model.eval()
+ return model
+
+
+@dataclass
+class _WaveRNNMixin:
+ _wavernn_path: Optional[str]
+ _wavernn_params: Optional[Dict[str, Any]]
+
+ def get_vocoder(self, *, dl_kwargs=None):
+ wavernn = self._get_wavernn(dl_kwargs=dl_kwargs)
+ return _WaveRNNVocoder(wavernn)
+
+ def _get_wavernn(self, *, dl_kwargs=None):
+ model = WaveRNN(**self._wavernn_params)
+ url = f"{_BASE_URL}/{self._wavernn_path}"
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ state_dict = load_state_dict_from_url(url, **dl_kwargs)
+ model.load_state_dict(state_dict)
+ model.eval()
+ return model
+
+
+class _GriffinLimMixin:
+ def get_vocoder(self, **_):
+ return _GriffinLimVocoder()
+
+
+################################################################################
+# Bundle classes
+################################################################################
+
+
+@dataclass
+class _Tacotron2WaveRNNCharBundle(_WaveRNNMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
+ pass
+
+
+@dataclass
+class _Tacotron2WaveRNNPhoneBundle(_WaveRNNMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
+ pass
+
+
+@dataclass
+class _Tacotron2GriffinLimCharBundle(_GriffinLimMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
+ pass
+
+
+@dataclass
+class _Tacotron2GriffinLimPhoneBundle(_GriffinLimMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
+ pass
+
+
+################################################################################
+# Instantiate bundle objects
+################################################################################
+
+
+TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH = _Tacotron2GriffinLimCharBundle(
+ _tacotron2_path="tacotron2_english_characters_1500_epochs_ljspeech.pth",
+ _tacotron2_params=utils._get_taco_params(n_symbols=38),
+)
+TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
+:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.
+
+The text processor encodes the input texts character-by-character.
+
+You can find the training script `here `__.
+The default parameters were used.
+
+Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.
+
+Example - "Hello world! T T S stands for Text to Speech!"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+
+Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+""" # noqa: E501
+
+TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH = _Tacotron2GriffinLimPhoneBundle(
+ _tacotron2_path="tacotron2_english_phonemes_1500_epochs_ljspeech.pth",
+ _tacotron2_params=utils._get_taco_params(n_symbols=96),
+)
+TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and
+:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.
+
+The text processor encodes the input texts based on phonemes.
+It uses `DeepPhonemizer `__ to convert
+graphemes to phonemes.
+The model (*en_us_cmudict_forward*) was trained on
+`CMUDict `__.
+
+You can find the training script `here `__.
+The text processor is set to the *"english_phonemes"*.
+
+Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.
+
+Example - "Hello world! T T S stands for Text to Speech!"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+
+Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+
+""" # noqa: E501
+
+TACOTRON2_WAVERNN_CHAR_LJSPEECH = _Tacotron2WaveRNNCharBundle(
+ _tacotron2_path="tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pth",
+ _tacotron2_params=utils._get_taco_params(n_symbols=38),
+ _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth",
+ _wavernn_params=utils._get_wrnn_params(),
+)
+TACOTRON2_WAVERNN_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8-bit waveforms of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.
+
+The text processor encodes the input texts character-by-character.
+
+You can find the training script for Tacotron2 `here `__.
+The following parameters were used: ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
+``mel_fmin=40``, and ``mel_fmax=11025``.
+
+You can find the training script for WaveRNN `here `__.
+
+Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.
+
+Example - "Hello world! T T S stands for Text to Speech!"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+
+Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+""" # noqa: E501
+
+TACOTRON2_WAVERNN_PHONE_LJSPEECH = _Tacotron2WaveRNNPhoneBundle(
+ _tacotron2_path="tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth",
+ _tacotron2_params=utils._get_taco_params(n_symbols=96),
+ _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth",
+ _wavernn_params=utils._get_wrnn_params(),
+)
+TACOTRON2_WAVERNN_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
+:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8-bit waveforms of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.
+
+The text processor encodes the input texts based on phonemes.
+It uses `DeepPhonemizer `__ to convert
+graphemes to phonemes.
+The model (*en_us_cmudict_forward*) was trained on
+`CMUDict `__.
+
+You can find the training script for Tacotron2 `here `__.
+The following parameters were used: ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
+``mel_fmin=40``, and ``mel_fmax=11025``.
+
+You can find the training script for WaveRNN `here `__.
+
+Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.
+
+Example - "Hello world! T T S stands for Text to Speech!"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+
+
+Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
+ :alt: Spectrogram generated by Tacotron2
+
+ .. raw:: html
+
+
+""" # noqa: E501
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/interface.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..273dfca2b14877cebc7cdb0716d60440693a775e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/interface.py
@@ -0,0 +1,255 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple, Union
+
+from torch import Tensor
+from torchaudio.models import Tacotron2
+
+
+class _TextProcessor(ABC):
+ @property
+ @abstractmethod
+ def tokens(self):
+ """The tokens that the each value in the processed tensor represent.
+
+ :type: List[str]
+ """
+
+ @abstractmethod
+ def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
+ """Encode the given (batch of) texts into numerical tensors
+
+ Args:
+ texts (str or list of str): The input texts.
+
+ Returns:
+ (Tensor, Tensor):
+ Tensor:
+ The encoded texts. Shape: `(batch, max length)`
+ Tensor:
+ The valid length of each sample in the batch. Shape: `(batch, )`.
+ """
+
+
+class _Vocoder(ABC):
+ @property
+ @abstractmethod
+ def sample_rate(self):
+ """The sample rate of the resulting waveform
+
+ :type: float
+ """
+
+ @abstractmethod
+ def __call__(self, specgrams: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
+ """Generate waveform from the given input, such as spectrogram
+
+ Args:
+ specgrams (Tensor):
+ The input spectrogram. Shape: `(batch, frequency bins, time)`.
+ The expected shape depends on the implementation.
+ lengths (Tensor, or None, optional):
+ The valid length of each sample in the batch. Shape: `(batch, )`.
+ (Default: `None`)
+
+ Returns:
+ (Tensor, Optional[Tensor]):
+ Tensor:
+ The generated waveform. Shape: `(batch, max length)`
+ Tensor or None:
+ The valid length of each sample in the batch. Shape: `(batch, )`.
+ """
+
+
+class Tacotron2TTSBundle(ABC):
+ """Data class that bundles associated information to use pretrained Tacotron2 and vocoder.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+ Please see below for the usage and the available values.
+
+ Example - Character-based TTS pipeline with Tacotron2 and WaveRNN
+ >>> import torchaudio
+ >>>
+ >>> text = "Hello, T T S !"
+ >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
+ >>>
+ >>> # Build processor, Tacotron2 and WaveRNN model
+ >>> processor = bundle.get_text_processor()
+ >>> tacotron2 = bundle.get_tacotron2()
+ Downloading:
+ 100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s]
+ >>> vocoder = bundle.get_vocoder()
+ Downloading:
+ 100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s]
+ >>>
+ >>> # Encode text
+ >>> input, lengths = processor(text)
+ >>>
+ >>> # Generate (mel-scale) spectrogram
+ >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
+ >>>
+ >>> # Convert spectrogram to waveform
+ >>> waveforms, lengths = vocoder(specgram, lengths)
+ >>>
+ >>> torchaudio.save('hello-tts.wav', waveforms, vocoder.sample_rate)
+
+ Example - Phoneme-based TTS pipeline with Tacotron2 and WaveRNN
+ >>>
+ >>> # Note:
+ >>> # This bundle uses pre-trained DeepPhonemizer as
+ >>> # the text pre-processor.
+ >>> # Please install deep-phonemizer.
+ >>> # See https://github.com/as-ideas/DeepPhonemizer
+ >>> # The pretrained weight is automatically downloaded.
+ >>>
+ >>> import torchaudio
+ >>>
+ >>> text = "Hello, TTS!"
+ >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+ >>>
+ >>> # Build processor, Tacotron2 and WaveRNN model
+ >>> processor = bundle.get_text_processor()
+ Downloading:
+ 100%|███████████████████████████████| 63.6M/63.6M [00:04<00:00, 15.3MB/s]
+ >>> tacotron2 = bundle.get_tacotron2()
+ Downloading:
+ 100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s]
+ >>> vocoder = bundle.get_vocoder()
+ Downloading:
+ 100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s]
+ >>>
+ >>> # Encode text
+ >>> input, lengths = processor(text)
+ >>>
+ >>> # Generate (mel-scale) spectrogram
+ >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
+ >>>
+ >>> # Convert spectrogram to waveform
+ >>> waveforms, lengths = vocoder(specgram, lengths)
+ >>>
+ >>> torchaudio.save('hello-tts.wav', waveforms, vocoder.sample_rate)
+ """
+
+ # Using the inner class so that these interfaces are not directly exposed on
+ # `torchaudio.pipelines`, but still listed in documentation.
+ # Text processing and vocoding are generic, and we do not know what kinds of
+ # new text processors and vocoders will be added in the future, so we keep these
+ # interfaces specific to this Tacotron2TTS pipeline.
+
+ class TextProcessor(_TextProcessor):
+ """Interface of the text processing part of Tacotron2TTS pipeline
+
+ See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
+ """
+
+ class Vocoder(_Vocoder):
+ """Interface of the vocoder part of Tacotron2TTS pipeline
+
+ See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
+ """
+
+ @abstractmethod
+ def get_text_processor(self, *, dl_kwargs=None) -> TextProcessor:
+ """Create a text processor
+
+ For the character-based pipeline, this processor splits the input text by character.
+ For the phoneme-based pipeline, this processor converts the input text (graphemes) to
+ phonemes.
+
+ If a pre-trained weight file is necessary,
+ :func:`torch.hub.download_url_to_file` is used to download it.
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments):
+ Passed to :func:`torch.hub.download_url_to_file`.
+
+ Returns:
+ TextProcessor:
+ A callable which takes a string or a list of strings as input and
+ returns Tensor of encoded texts and Tensor of valid lengths.
+ The object also has a ``tokens`` property, which allows you to recover the
+ tokenized form.
+
+ Example - Character-based
+ >>> text = [
+ >>> "Hello World!",
+ >>> "Text-to-speech!",
+ >>> ]
+ >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
+ >>> processor = bundle.get_text_processor()
+ >>> input, lengths = processor(text)
+ >>>
+ >>> print(input)
+ tensor([[19, 16, 23, 23, 26, 11, 34, 26, 29, 23, 15, 2, 0, 0, 0],
+ [31, 16, 35, 31, 1, 31, 26, 1, 30, 27, 16, 16, 14, 19, 2]],
+ dtype=torch.int32)
+ >>>
+ >>> print(lengths)
+ tensor([12, 15], dtype=torch.int32)
+ >>>
+ >>> print([processor.tokens[i] for i in input[0, :lengths[0]]])
+ ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
+ >>> print([processor.tokens[i] for i in input[1, :lengths[1]]])
+ ['t', 'e', 'x', 't', '-', 't', 'o', '-', 's', 'p', 'e', 'e', 'c', 'h', '!']
+
+ Example - Phoneme-based
+ >>> text = [
+ >>> "Hello, T T S !",
+ >>> "Text-to-speech!",
+ >>> ]
+ >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+ >>> processor = bundle.get_text_processor()
+ Downloading:
+ 100%|███████████████████████████████| 63.6M/63.6M [00:04<00:00, 15.3MB/s]
+ >>> input, lengths = processor(text)
+ >>>
+ >>> print(input)
+ tensor([[54, 20, 65, 69, 11, 92, 44, 65, 38, 2, 0, 0, 0, 0],
+ [81, 40, 64, 79, 81, 1, 81, 20, 1, 79, 77, 59, 37, 2]],
+ dtype=torch.int32)
+ >>>
+ >>> print(lengths)
+ tensor([10, 14], dtype=torch.int32)
+ >>>
+ >>> print([processor.tokens[i] for i in input[0]])
+ ['HH', 'AH', 'L', 'OW', ' ', 'W', 'ER', 'L', 'D', '!', '_', '_', '_', '_']
+ >>> print([processor.tokens[i] for i in input[1]])
+ ['T', 'EH', 'K', 'S', 'T', '-', 'T', 'AH', '-', 'S', 'P', 'IY', 'CH', '!']
+ """
+
+ @abstractmethod
+ def get_vocoder(self, *, dl_kwargs=None) -> Vocoder:
+ """Create a vocoder module, based off of either WaveRNN or GriffinLim.
+
+ If a pre-trained weight file is necessary,
+ :func:`torch.hub.load_state_dict_from_url` is used to download it.
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments):
+ Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Vocoder:
+ A vocoder module, which takes spectrogram Tensor and an optional
+ length Tensor, then returns resulting waveform Tensor and an optional
+ length Tensor.
+ """
+
+ @abstractmethod
+ def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
+ """Create a Tacotron2 model with pre-trained weight.
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments):
+ Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Tacotron2:
+ The resulting model.
+ """
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/utils.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c94c21ec519b92647033a81d1bb026e5296ffc64
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_tts/utils.py
@@ -0,0 +1,228 @@
+import logging
+import os
+
+import torch
+from torchaudio._internal import download_url_to_file, module_utils as _mod_utils
+
+
+def _get_chars():
+ return (
+ "_",
+ "-",
+ "!",
+ "'",
+ "(",
+ ")",
+ ",",
+ ".",
+ ":",
+ ";",
+ "?",
+ " ",
+ "a",
+ "b",
+ "c",
+ "d",
+ "e",
+ "f",
+ "g",
+ "h",
+ "i",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "o",
+ "p",
+ "q",
+ "r",
+ "s",
+ "t",
+ "u",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
+ )
+
+
+def _get_phones():
+ return (
+ "_",
+ "-",
+ "!",
+ "'",
+ "(",
+ ")",
+ ",",
+ ".",
+ ":",
+ ";",
+ "?",
+ " ",
+ "AA",
+ "AA0",
+ "AA1",
+ "AA2",
+ "AE",
+ "AE0",
+ "AE1",
+ "AE2",
+ "AH",
+ "AH0",
+ "AH1",
+ "AH2",
+ "AO",
+ "AO0",
+ "AO1",
+ "AO2",
+ "AW",
+ "AW0",
+ "AW1",
+ "AW2",
+ "AY",
+ "AY0",
+ "AY1",
+ "AY2",
+ "B",
+ "CH",
+ "D",
+ "DH",
+ "EH",
+ "EH0",
+ "EH1",
+ "EH2",
+ "ER",
+ "ER0",
+ "ER1",
+ "ER2",
+ "EY",
+ "EY0",
+ "EY1",
+ "EY2",
+ "F",
+ "G",
+ "HH",
+ "IH",
+ "IH0",
+ "IH1",
+ "IH2",
+ "IY",
+ "IY0",
+ "IY1",
+ "IY2",
+ "JH",
+ "K",
+ "L",
+ "M",
+ "N",
+ "NG",
+ "OW",
+ "OW0",
+ "OW1",
+ "OW2",
+ "OY",
+ "OY0",
+ "OY1",
+ "OY2",
+ "P",
+ "R",
+ "S",
+ "SH",
+ "T",
+ "TH",
+ "UH",
+ "UH0",
+ "UH1",
+ "UH2",
+ "UW",
+ "UW0",
+ "UW1",
+ "UW2",
+ "V",
+ "W",
+ "Y",
+ "Z",
+ "ZH",
+ )
+
+
+def _to_tensor(indices):
+ lengths = torch.tensor([len(i) for i in indices], dtype=torch.int32)
+ values = [torch.tensor(i) for i in indices]
+ values = torch.nn.utils.rnn.pad_sequence(values, batch_first=True)
+ return values, lengths
+
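+# Illustrative example (not part of the original module): ``_to_tensor`` zero-pads
+# the per-sample index lists and reports their original lengths. For instance,
+# _to_tensor([[1, 2, 3], [4, 5]]) returns the padded tensor [[1, 2, 3], [4, 5, 0]]
+# together with lengths tensor([3, 2], dtype=torch.int32).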
+
+def _load_phonemizer(file, dl_kwargs):
+ if not _mod_utils.is_module_available("dp"):
+ raise RuntimeError("DeepPhonemizer is not installed. Please install it.")
+
+ from dp.phonemizer import Phonemizer
+
+ # By default, dp emits DEBUG-level logs. Temporarily raise the level to reduce noise.
+ logger = logging.getLogger("dp")
+ orig_level = logger.level
+ logger.setLevel(logging.INFO)
+ try:
+ url = f"https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/{file}"
+ directory = os.path.join(torch.hub.get_dir(), "checkpoints")
+ os.makedirs(directory, exist_ok=True)
+ path = os.path.join(directory, file)
+ if not os.path.exists(path):
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ download_url_to_file(url, path, **dl_kwargs)
+ return Phonemizer.from_checkpoint(path)
+ finally:
+ logger.setLevel(orig_level)
+
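+# Illustrative sketch (not part of the original module): the phoneme-based TTS
+# pipeline loads the DeepPhonemizer checkpoint roughly as below. The checkpoint
+# file name is an assumption for illustration only.
+#
+#   >>> phonemizer = _load_phonemizer("en_us_cmudict_forward.pt", dl_kwargs=None)
+#   >>> phonemizer("Hello world", lang="en_us")  # grapheme-to-phoneme conversion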
+
+def _unnormalize_waveform(waveform: torch.Tensor, bits: int) -> torch.Tensor:
+ r"""Transform waveform [-1, 1] to label [0, 2 ** bits - 1]"""
+ waveform = torch.clamp(waveform, -1, 1)
+ waveform = (waveform + 1.0) * (2**bits - 1) / 2
+ return torch.clamp(waveform, 0, 2**bits - 1).int()
+
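+# Illustrative example (not part of the original module): with ``bits=8`` the
+# mapping is x -> int((x + 1) * 255 / 2) after clamping, so
+# _unnormalize_waveform(torch.tensor([-1.0, 0.0, 1.0]), 8) gives
+# tensor([0, 127, 255], dtype=torch.int32).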
+
+def _get_taco_params(n_symbols):
+ return {
+ "mask_padding": False,
+ "n_mels": 80,
+ "n_frames_per_step": 1,
+ "symbol_embedding_dim": 512,
+ "encoder_embedding_dim": 512,
+ "encoder_n_convolution": 3,
+ "encoder_kernel_size": 5,
+ "decoder_rnn_dim": 1024,
+ "decoder_max_step": 2000,
+ "decoder_dropout": 0.1,
+ "decoder_early_stopping": True,
+ "attention_rnn_dim": 1024,
+ "attention_hidden_dim": 128,
+ "attention_location_n_filter": 32,
+ "attention_location_kernel_size": 31,
+ "attention_dropout": 0.1,
+ "prenet_dim": 256,
+ "postnet_n_convolution": 5,
+ "postnet_kernel_size": 5,
+ "postnet_embedding_dim": 512,
+ "gate_threshold": 0.5,
+ "n_symbol": n_symbols,
+ }
+
+
+def _get_wrnn_params():
+ return {
+ "upsample_scales": [5, 5, 11],
+ "n_classes": 2**8, # n_bits = 8
+ "hop_length": 275,
+ "n_res_block": 10,
+ "n_rnn": 512,
+ "n_fc": 512,
+ "kernel_size": 5,
+ "n_freq": 80,
+ "n_hidden": 128,
+ "n_output": 128,
+ }
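+
+
+# Illustrative note (not part of the original module): these parameter dicts are
+# what the TTS bundles feed into the model constructors, roughly
+#   Tacotron2(**_get_taco_params(n_symbols=38)) and WaveRNN(**_get_wrnn_params())
+# (the actual construction code lives elsewhere, so treat this as an assumption).
+# Note that the WaveRNN ``upsample_scales`` multiply out to 5 * 5 * 11 = 275,
+# which matches ``hop_length``.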
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__init__.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ec01f0f710ea00f24249785e2fdeee491fa6609
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b29ff49ea646691eba282c444d4d2df3c7b92dc
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4af01ae6561006745b8544e3fd854c4a9ec3fda7
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ef4384c6503a414ed2df437de5bf742af30c051
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/aligner.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/aligner.py
new file mode 100644
index 0000000000000000000000000000000000000000..f23b9cf65c733d39f13524171474f324666e22dd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/aligner.py
@@ -0,0 +1,87 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List
+
+import torch
+import torchaudio.functional as F
+from torch import Tensor
+from torchaudio.functional import TokenSpan
+
+
+class ITokenizer(ABC):
+ @abstractmethod
+ def __call__(self, transcript: List[str]) -> List[List[int]]:
+ """Tokenize the given transcript (list of words)
+
+ .. note::
+
+ The transcript must be normalized.
+
+ Args:
+ transcript (list of str): Transcript (list of words).
+
+ Returns:
+ (list of lists of int): List of token sequences, one per word.
+ """
+
+
+class Tokenizer(ITokenizer):
+ def __init__(self, dictionary: Dict[str, int]):
+ self.dictionary = dictionary
+
+ def __call__(self, transcript: List[str]) -> List[List[int]]:
+ return [[self.dictionary[c] for c in word] for word in transcript]
+
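+# Illustrative example (not part of the original module): with a toy dictionary,
+# Tokenizer({"a": 1, "b": 2, "c": 3})(["ab", "c"]) returns [[1, 2], [3]],
+# i.e. one integer sequence per word of the transcript.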
+
+def _align_emission_and_tokens(emission: Tensor, tokens: List[int], blank: int = 0):
+ device = emission.device
+ emission = emission.unsqueeze(0)
+ targets = torch.tensor([tokens], dtype=torch.int32, device=device)
+
+ aligned_tokens, scores = F.forced_align(emission, targets, blank=blank)
+
+ scores = scores.exp() # convert back to probability
+ aligned_tokens, scores = aligned_tokens[0], scores[0] # remove batch dimension
+ return aligned_tokens, scores
+
+
+class IAligner(ABC):
+ @abstractmethod
+ def __call__(self, emission: Tensor, tokens: List[List[int]]) -> List[List[TokenSpan]]:
+ """Generate list of time-stamped token sequences
+
+ Args:
+ emission (Tensor): Sequence of token probability distributions in log-domain.
+ Shape: `(time, tokens)`.
+ tokens (list of integer sequence): Tokenized transcript.
+ Output from :py:class:`torchaudio.pipelines.Wav2Vec2FABundle.Tokenizer`.
+
+ Returns:
+ (list of TokenSpan sequence): Tokens with time stamps and scores.
+ """
+
+
+def _unflatten(list_, lengths):
+ assert len(list_) == sum(lengths)
+ i = 0
+ ret = []
+ for l in lengths:
+ ret.append(list_[i : i + l])
+ i += l
+ return ret
+
+
+def _flatten(nested_list):
+ return [item for list_ in nested_list for item in list_]
+
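+# Illustrative examples (not part of the original module):
+#   _unflatten([1, 2, 3, 4, 5], [2, 3]) -> [[1, 2], [3, 4, 5]]
+#   _flatten([[1, 2], [3, 4, 5]])       -> [1, 2, 3, 4, 5]
+# The Aligner below flattens the per-word token lists before forced alignment and
+# then regroups the resulting spans word by word.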
+
+class Aligner(IAligner):
+ def __init__(self, blank):
+ self.blank = blank
+
+ def __call__(self, emission: Tensor, tokens: List[List[int]]) -> List[List[TokenSpan]]:
+ if emission.ndim != 2:
+ raise ValueError(f"The input emission must be 2D. Found: {emission.shape}")
+
+ aligned_tokens, scores = _align_emission_and_tokens(emission, _flatten(tokens), self.blank)
+ spans = F.merge_tokens(aligned_tokens, scores)
+ return _unflatten(spans, [len(ts) for ts in tokens])
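+
+
+# Illustrative sketch (not part of the original module): end-to-end use of the
+# helpers above with a random CTC-style emission (blank index 0), showing the
+# expected shapes and types only.
+#
+#   >>> import torch
+#   >>> tokenizer = Tokenizer({"a": 1, "b": 2, "c": 3})
+#   >>> tokens = tokenizer(["ab", "c"])                  # [[1, 2], [3]]
+#   >>> emission = torch.randn(50, 4).log_softmax(dim=-1)
+#   >>> word_spans = Aligner(blank=0)(emission, tokens)  # one list of TokenSpan per word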
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/impl.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..be21da436024275dae50e5b7fd22e351ab9b8e5d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/impl.py
@@ -0,0 +1,1699 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple
+
+from torch.nn import Module
+
+from . import aligner, utils
+
+
+__all__ = [] # type: ignore
+
+
+@dataclass
+class Wav2Vec2Bundle:
+ """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+ Please see below for the usage and the available values.
+
+ Example - Feature Extraction
+ >>> import torchaudio
+ >>>
+ >>> bundle = torchaudio.pipelines.HUBERT_BASE
+ >>>
+ >>> # Build the model and load pretrained weight.
+ >>> model = bundle.get_model()
+ Downloading:
+ 100%|███████████████████████████████| 360M/360M [00:06<00:00, 60.6MB/s]
+ >>>
+ >>> # Resample audio to the expected sampling rate
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+ >>>
+ >>> # Extract acoustic features
+ >>> features, _ = model.extract_features(waveform)
+ """ # noqa: E501
+
+ _path: str
+ _params: Dict[str, Any]
+ _sample_rate: float
+ _normalize_waveform: bool
+ _model_type: str
+
+ @property
+ def sample_rate(self) -> float:
+ """Sample rate of the audio that the model is trained on.
+
+ :type: float
+ """
+ return self._sample_rate
+
+ def _get_state_dict(self, dl_kwargs):
+ # Note: This method is overridden in ASR bundle
+ return utils._get_state_dict(self._path, dl_kwargs)
+
+ def get_model(self, *, dl_kwargs=None) -> Module:
+ """Construct the model and load the pretrained weight.
+
+ The weight file is downloaded from the internet and cached with
+ :func:`torch.hub.load_state_dict_from_url`
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+ For the models listed below, an additional layer normalization is performed on the input:
+
+ - WAV2VEC2_LARGE_LV60K
+ - WAV2VEC2_ASR_LARGE_LV60K_10M
+ - WAV2VEC2_ASR_LARGE_LV60K_100H
+ - WAV2VEC2_ASR_LARGE_LV60K_960H
+ - WAV2VEC2_XLSR53
+ - WAV2VEC2_XLSR_300M
+ - WAV2VEC2_XLSR_1B
+ - WAV2VEC2_XLSR_2B
+ - HUBERT_LARGE
+ - HUBERT_XLARGE
+ - HUBERT_ASR_LARGE
+ - HUBERT_ASR_XLARGE
+ - WAVLM_LARGE
+
+ For all other models, a :py:class:`~torchaudio.models.Wav2Vec2Model` instance is returned.
+ """
+ model = utils._get_model(self._model_type, self._params)
+ state_dict = self._get_state_dict(dl_kwargs)
+ model.load_state_dict(state_dict)
+ if self._normalize_waveform:
+ model = utils._extend_model(model, normalize_waveform=True)
+ model.eval()
+ return model
+
+
+@dataclass
+class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
+ """Data class that bundles associated information to use pretrained
+ :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+ Please see below for the usage and the available values.
+
+ Example - ASR
+ >>> import torchaudio
+ >>>
+ >>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
+ >>>
+ >>> # Build the model and load pretrained weight.
+ >>> model = bundle.get_model()
+ Downloading:
+ 100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s]
+ >>>
+ >>> # Check the corresponding labels of the output.
+ >>> labels = bundle.get_labels()
+ >>> print(labels)
+ ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
+ >>>
+ >>> # Resample audio to the expected sampling rate
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+ >>>
+ >>> # Infer the label probability distribution
+ >>> emissions, _ = model(waveform)
+ >>>
+ >>> # Pass emission to decoder
+ >>> # `ctc_decode` is for illustration purpose only
+ >>> transcripts = ctc_decode(emissions, labels)
+ """ # noqa: E501
+
+ _labels: Tuple[str, ...]
+ _remove_aux_axis: Tuple[int, ...] = (1, 2, 3)
+
+ def get_labels(
+ self,
+ *,
+ blank: str = "-",
+ ) -> Tuple[str, ...]:
+ """The output class labels.
+
+ The first is the blank token, and it is customizable.
+
+ Args:
+ blank (str, optional): Blank token. (default: ``'-'``)
+
+ Returns:
+ Tuple[str, ...]:
+ For models fine-tuned on ASR, returns the tuple of strings representing
+ the output class labels.
+
+ Example
+ >>> from torchaudio.pipelines import HUBERT_ASR_LARGE as bundle
+ >>> bundle.get_labels()
+ ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
+ """ # noqa: E501
+ return (blank, *self._labels)
+
+ def _get_state_dict(self, dl_kwargs):
+ return utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis)
+
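+# Illustrative sketch (not part of the original module): ``ctc_decode`` in the
+# docstring above is a placeholder. A minimal greedy CTC decoder for the emissions
+# of an ASR bundle could look like the following (it assumes the blank token is at
+# index 0 and that '|' marks word boundaries):
+#
+#   >>> import torch
+#   >>> def greedy_ctc_decode(emission, labels):
+#   ...     indices = torch.unique_consecutive(emission.argmax(dim=-1))
+#   ...     indices = indices[indices != 0]  # drop the blank token
+#   ...     return "".join(labels[i] for i in indices).replace("|", " ")
+#   >>> # transcript = greedy_ctc_decode(emissions[0], bundle.get_labels())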
+
+WAV2VEC2_BASE = Wav2Vec2Bundle(
+ _path="wav2vec2_fairseq_base_ls960.pth",
+ _params={
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_BASE.__doc__ = """Wav2vec 2.0 model ("base" architecture),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_BASE_10M = Wav2Vec2ASRBundle(
+ _path="wav2vec2_fairseq_base_ls960_asr_ll10m.pth",
+ _params={
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_BASE_10M.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
+:cite:`librilight` ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_BASE_100H = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_base_ls960_asr_ls100.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+
+WAV2VEC2_ASR_BASE_100H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 100 hours of transcribed audio from the same dataset ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_BASE_960H = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_base_ls960_asr_ls960.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_BASE_960H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on the same audio with the corresponding transcripts.
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_LARGE = Wav2Vec2Bundle(
+ "wav2vec2_fairseq_large_ls960.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.2,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_LARGE.__doc__ = """Wav2vec 2.0 model ("large" architecture),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_LARGE_10M = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_large_ls960_asr_ll10m.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.2,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_LARGE_10M.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
+:cite:`librilight` ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_LARGE_100H = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_large_ls960_asr_ls100.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.2,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_LARGE_100H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 100 hours of transcribed audio from
+the same dataset ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_LARGE_960H = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_large_ls960_asr_ls960.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.2,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_LARGE_960H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on the same audio with the corresponding transcripts.
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_LARGE_LV60K = Wav2Vec2Bundle(
+ "wav2vec2_fairseq_large_lv60k.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_LARGE_LV60K.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
+not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_large_lv60k_asr_ll10m.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
+fine-tuned for ASR on 10 minutes of transcribed audio from the same dataset ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_large_lv60k_asr_ls100.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
+fine-tuned for ASR on 100 hours of transcribed audio from
+*LibriSpeech* dataset :cite:`7178964` ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2ASRBundle(
+ "wav2vec2_fairseq_large_lv60k_asr_ls960.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* :cite:`librilight` dataset, and
+fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+WAV2VEC2_XLSR53 = Wav2Vec2Bundle(
+ "wav2vec2_fairseq_large_xlsr53.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+WAV2VEC2_XLSR53.__doc__ = """Wav2vec 2.0 model ("base" architecture),
+pre-trained on 56,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common` and
+*BABEL* :cite:`Gales2014SpeechRA`),
+not fine-tuned.
+
+Originally published by the authors of
+*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
+:cite:`conneau2020unsupervised` under MIT License and redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+HUBERT_BASE = Wav2Vec2Bundle(
+ "hubert_fairseq_base_ls960.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+HUBERT_BASE.__doc__ = """HuBERT model ("base" architecture),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.
+
+Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+HUBERT_LARGE = Wav2Vec2Bundle(
+ "hubert_fairseq_large_ll60k.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+HUBERT_LARGE.__doc__ = """HuBERT model ("large" architecture),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
+not fine-tuned.
+
+Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+HUBERT_XLARGE = Wav2Vec2Bundle(
+ "hubert_fairseq_xlarge_ll60k.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1280,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 48,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 5120,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+HUBERT_XLARGE.__doc__ = """HuBERT model ("extra large" architecture),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
+not fine-tuned.
+
+Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+HUBERT_ASR_LARGE = Wav2Vec2ASRBundle(
+ "hubert_fairseq_large_ll60k_asr_ls960.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+HUBERT_ASR_LARGE.__doc__ = """HuBERT model ("large" architecture),
+pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
+fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+HUBERT_ASR_XLARGE = Wav2Vec2ASRBundle(
+ "hubert_fairseq_xlarge_ll60k_asr_ls960.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1280,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 48,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 5120,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 29,
+ },
+ _labels=utils._get_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+HUBERT_ASR_XLARGE.__doc__ = """HuBERT model ("extra large" architecture),
+pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset :cite:`librilight`, and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset :cite:`7178964`
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+
+VOXPOPULI_ASR_BASE_10K_DE = Wav2Vec2ASRBundle(
+ "wav2vec2_voxpopuli_base_10k_asr_de.pt",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 32,
+ },
+ _labels=utils._get_de_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _remove_aux_axis=(1, 2, 3, 35),
+ _model_type="Wav2Vec2",
+)
+VOXPOPULI_ASR_BASE_10K_DE.__doc__ = """wav2vec 2.0 model ("base" architecture),
+pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
+("10k" subset, consisting of 23 languages), and
+fine-tuned for ASR on 282 hours of transcribed audio from "de" subset.
+
+Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+
+VOXPOPULI_ASR_BASE_10K_EN = Wav2Vec2ASRBundle(
+ "wav2vec2_voxpopuli_base_10k_asr_en.pt",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 28,
+ },
+ _labels=utils._get_vp_en_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _remove_aux_axis=(1, 2, 3, 31),
+ _model_type="Wav2Vec2",
+)
+VOXPOPULI_ASR_BASE_10K_EN.__doc__ = """wav2vec 2.0 model ("base" architecture),
+pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
+("10k" subset, consisting of 23 languages), and
+fine-tuned for ASR on 543 hours of transcribed audio from "en" subset.
+
+Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+
+VOXPOPULI_ASR_BASE_10K_ES = Wav2Vec2ASRBundle(
+ "wav2vec2_voxpopuli_base_10k_asr_es.pt",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 35,
+ },
+ _labels=utils._get_es_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _remove_aux_axis=(1, 2, 3, 35),
+ _model_type="Wav2Vec2",
+)
+VOXPOPULI_ASR_BASE_10K_ES.__doc__ = """wav2vec 2.0 model ("base" architecture),
+pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
+("10k" subset, consisting of 23 languages), and
+fine-tuned for ASR on 166 hours of transcribed audio from "es" subset.
+
+Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle(
+ "wav2vec2_voxpopuli_base_10k_asr_fr.pt",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 43,
+ },
+ _labels=utils._get_fr_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _model_type="Wav2Vec2",
+)
+VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model ("base" architecture),
+pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
+("10k" subset, consisting of 23 languages), and
+fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.
+
+Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+
+VOXPOPULI_ASR_BASE_10K_IT = Wav2Vec2ASRBundle(
+ "wav2vec2_voxpopuli_base_10k_asr_it.pt",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 37,
+ },
+ _labels=utils._get_it_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=False,
+ _remove_aux_axis=(1, 2, 3),
+ _model_type="Wav2Vec2",
+)
+VOXPOPULI_ASR_BASE_10K_IT.__doc__ = """wav2vec 2.0 model ("base" architecture),
+pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
+("10k" subset, consisting of 23 languages), and
+fine-tuned for ASR on 91 hours of transcribed audio from "it" subset.
+
+Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+""" # noqa: E501
+
+
+WAVLM_BASE = Wav2Vec2Bundle(
+ "wavlm_base.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_max_distance": 800,
+ "encoder_num_buckets": 320,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": None,
+ },
+ _model_type="WavLM",
+ _sample_rate=16000,
+ _normalize_waveform=False,
+)
+WAVLM_BASE.__doc__ = """WavLM Base model ("base" architecture),
+pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`, not fine-tuned.
+
+Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+
+WAVLM_BASE_PLUS = Wav2Vec2Bundle(
+ "wavlm_base_plus.pth",
+ {
+ "extractor_mode": "group_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 768,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 12,
+ "encoder_num_heads": 12,
+ "encoder_max_distance": 800,
+ "encoder_num_buckets": 320,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 3072,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": False,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": None,
+ },
+ _model_type="WavLM",
+ _sample_rate=16000,
+ _normalize_waveform=False,
+)
+WAVLM_BASE_PLUS.__doc__ = """WavLM Base+ model ("base" architecture),
+pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
+and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.
+
+Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+
+WAVLM_LARGE = Wav2Vec2Bundle(
+ "wavlm_large.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": False,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_max_distance": 800,
+ "encoder_num_buckets": 320,
+ "encoder_attention_dropout": 0.1,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.1,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.05,
+ "aux_num_out": None,
+ },
+ _model_type="WavLM",
+ _sample_rate=16000,
+ _normalize_waveform=True,
+)
+WAVLM_LARGE.__doc__ = """WavLM Large model ("large" architecture),
+pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
+and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.
+
+Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+""" # noqa: E501
+
+
+WAV2VEC2_XLSR_300M = Wav2Vec2Bundle(
+ "wav2vec2_xlsr_300m.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _model_type="Wav2Vec2",
+ _sample_rate=16000,
+ _normalize_waveform=True,
+)
+WAV2VEC2_XLSR_300M.__doc__ = """XLS-R model with 300 million parameters,
+pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common`,
+*VoxLingua107* :cite:`valk2021voxlingua107`,
+*BABEL* :cite:`Gales2014SpeechRA`, and
+*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
+not fine-tuned.
+
+Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
+""" # noqa: E501
+
+
+WAV2VEC2_XLSR_1B = Wav2Vec2Bundle(
+ "wav2vec2_xlsr_1b.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1280,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 48,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 5120,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _model_type="Wav2Vec2",
+ _sample_rate=16000,
+ _normalize_waveform=True,
+)
+WAV2VEC2_XLSR_1B.__doc__ = """XLS-R model with 1 billion parameters,
+pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common`,
+*VoxLingua107* :cite:`valk2021voxlingua107`,
+*BABEL* :cite:`Gales2014SpeechRA`, and
+*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
+not fine-tuned.
+
+Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
+""" # noqa: E501
+
+WAV2VEC2_XLSR_2B = Wav2Vec2Bundle(
+ "wav2vec2_xlsr_2b.pth",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1920,
+ "encoder_projection_dropout": 0.1,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 48,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 7680,
+ "encoder_ff_interm_dropout": 0.0,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.0,
+ "aux_num_out": None,
+ },
+ _model_type="Wav2Vec2",
+ _sample_rate=16000,
+ _normalize_waveform=True,
+)
+WAV2VEC2_XLSR_2B.__doc__ = """XLS-R model with 2 billion parameters,
+pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common`,
+*VoxLingua107* :cite:`valk2021voxlingua107`,
+*BABEL* :cite:`Gales2014SpeechRA`, and
+*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
+not fine-tuned.
+
+Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
+redistributed with the same license.
+[`License `__,
+`Source `__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
+""" # noqa: E501
+
+
+@dataclass
+class Wav2Vec2FABundle(Wav2Vec2ASRBundle):
+ """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model` for forced alignment.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+ Please see below for the usage and the available values.
+
+ Example - Feature Extraction
+ >>> import torchaudio
+ >>>
+ >>> bundle = torchaudio.pipelines.MMS_FA
+ >>>
+ >>> # Build the model and load pretrained weight.
+ >>> model = bundle.get_model()
+ Downloading:
+ 100%|███████████████████████████████| 1.18G/1.18G [00:05<00:00, 216MB/s]
+ >>>
+ >>> # Resample audio to the expected sampling rate
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+ >>>
+ >>> # Estimate the probability of token distribution
+ >>> emission, _ = model(waveform)
+ >>>
+ >>> # Generate frame-wise alignment
+ >>> alignment, scores = torchaudio.functional.forced_align(
+ >>> emission, targets, input_lengths, target_lengths, blank=0)
+ >>>
+ """ # noqa: E501
+
+ class Tokenizer(aligner.ITokenizer):
+ """Interface of the tokenizer"""
+
+ class Aligner(aligner.IAligner):
+ """Interface of the aligner"""
+
+ def get_labels(self, star: Optional[str] = "*", blank: str = "-") -> Tuple[str, ...]:
+ """Get the labels corresponding to the feature dimension of emission.
+
+        The first label is the blank token, which is customizable.
+
+ Args:
+ star (str or None, optional): Change or disable star token. (default: ``"*"``)
+ blank (str, optional): Change the blank token. (default: ``'-'``)
+
+ Returns:
+ Tuple[str, ...]:
+ For models fine-tuned on ASR, returns the tuple of strings representing
+ the output class labels.
+
+ Example
+ >>> from torchaudio.pipelines import MMS_FA as bundle
+ >>> bundle.get_labels()
+ ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x', '*')
+ >>> bundle.get_labels(star=None)
+ ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x')
+ """ # noqa: E501
+ labels = super().get_labels(blank=blank)
+ return labels if star is None else (*labels, star)
+
+ def get_model(self, with_star: bool = True, *, dl_kwargs=None) -> Module:
+ """Construct the model and load the pretrained weight.
+
+ The weight file is downloaded from the internet and cached with
+ :func:`torch.hub.load_state_dict_from_url`
+
+ Args:
+            with_star (bool, optional): If enabled, the last dimension of the output layer is
+                extended by one, which corresponds to the `star` token.
+ dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+ .. note::
+
+            The model created with this method returns probabilities in the log domain
+            (i.e. :py:func:`torch.nn.functional.log_softmax` is applied), whereas
+            the other Wav2Vec2 models return logits.
+ """
+ model = utils._get_model(self._model_type, self._params)
+ state_dict = utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis)
+ model.load_state_dict(state_dict)
+ model = utils._extend_model(
+ model, normalize_waveform=self._normalize_waveform, apply_log_softmax=True, append_star=with_star
+ )
+ model.eval()
+ return model
+
+ def get_dict(self, star: Optional[str] = "*", blank: str = "-") -> Dict[str, int]:
+ """Get the mapping from token to index (in emission feature dim)
+
+ Args:
+ star (str or None, optional): Change or disable star token. (default: ``"*"``)
+ blank (str, optional): Change the blank token. (default: ``'-'``)
+
+ Returns:
+            Dict[str, int]:
+                A dictionary mapping each token to its index along the emission feature dimension.
+
+ Example
+ >>> from torchaudio.pipelines import MMS_FA as bundle
+ >>> bundle.get_dict()
+ {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27, '*': 28}
+ >>> bundle.get_dict(star=None)
+ {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27}
+ """ # noqa: E501
+ return {k: i for i, k in enumerate(self.get_labels(star=star, blank=blank))}
+
+ def get_tokenizer(self) -> Tokenizer:
+ """Instantiate a Tokenizer.
+
+ Returns:
+ Tokenizer
+ """
+ return aligner.Tokenizer(self.get_dict())
+
+ def get_aligner(self) -> Aligner:
+ """Instantiate an Aligner.
+
+ Returns:
+ Aligner
+ """
+ return aligner.Aligner(blank=0)
+
+
+MMS_FA = Wav2Vec2FABundle(
+ "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt",
+ {
+ "extractor_mode": "layer_norm",
+ "extractor_conv_layer_config": [
+ (512, 10, 5),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 3, 2),
+ (512, 2, 2),
+ (512, 2, 2),
+ ],
+ "extractor_conv_bias": True,
+ "encoder_embed_dim": 1024,
+ "encoder_projection_dropout": 0.0,
+ "encoder_pos_conv_kernel": 128,
+ "encoder_pos_conv_groups": 16,
+ "encoder_num_layers": 24,
+ "encoder_num_heads": 16,
+ "encoder_attention_dropout": 0.0,
+ "encoder_ff_interm_features": 4096,
+ "encoder_ff_interm_dropout": 0.1,
+ "encoder_dropout": 0.0,
+ "encoder_layer_norm_first": True,
+ "encoder_layer_drop": 0.1,
+ "aux_num_out": 28,
+ },
+ _labels=utils._get_mms_labels(),
+ _sample_rate=16000,
+ _normalize_waveform=True,
+ _model_type="Wav2Vec2",
+)
+MMS_FA.__doc__ = """
+Trained on 31K hours of data in 1,130 languages from *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling`.
+
+Published by the authors of *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling` under [`CC-BY-NC 4.0 License `__].
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2FABundle` for usage details.
+
+.. note::
+
+ Unlike other Wav2Vec2 bundles, this model does not have a token for word boundary (like `|`). This makes the post-processing of alignments slightly different.
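+
+Example
+    A minimal sketch (``waveform`` is assumed to be a 16 kHz waveform Tensor of shape `(1, time)`,
+    and the transcript a hypothetical list of romanized words):
+
+    >>> bundle = torchaudio.pipelines.MMS_FA
+    >>> model = bundle.get_model()
+    >>> tokenizer = bundle.get_tokenizer()
+    >>> aligner = bundle.get_aligner()
+    >>> emission, _ = model(waveform)
+    >>> token_spans = aligner(emission[0], tokenizer(["hello", "world"]))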
+""" # noqa: E501
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/utils.py b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6deab5606e7f0332fb182ffd8d7711ef6b366b0f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/_wav2vec2/utils.py
@@ -0,0 +1,346 @@
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn, Tensor
+
+from torchaudio._internal import load_state_dict_from_url
+from torchaudio.models import wav2vec2_model, Wav2Vec2Model, wavlm_model
+
+
+def _get_model(type_, params):
+ factories = {
+ "Wav2Vec2": wav2vec2_model,
+ "WavLM": wavlm_model,
+ }
+ if type_ not in factories:
+ raise ValueError(f"Supported model types are {tuple(factories.keys())}. Found: {type_}")
+ factory = factories[type_]
+ return factory(**params)
+
+
+class _Wav2Vec2Model(nn.Module):
+ """Wrapper class for :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+    This wrapper optionally applies layer normalization to the input waveform,
+    log-softmax to the output, and an extra "star" score column to the output.
+ """
+
+ def __init__(self, model: Wav2Vec2Model, normalize_waveform: bool, apply_log_softmax: bool, append_star: bool):
+ super().__init__()
+ self.model = model
+ self.normalize_waveform = normalize_waveform
+ self.apply_log_softmax = apply_log_softmax
+ self.append_star = append_star
+
+ def forward(self, waveforms: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
+ if self.normalize_waveform:
+ waveforms = nn.functional.layer_norm(waveforms, waveforms.shape)
+ output, output_lengths = self.model(waveforms, lengths)
+ if self.apply_log_softmax:
+ output = torch.nn.functional.log_softmax(output, dim=-1)
+ if self.append_star:
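+            # Append a zero-valued (log-domain) score column for the "star" token.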
+ star_dim = torch.zeros((1, output.size(1), 1), dtype=output.dtype, device=output.device)
+ output = torch.cat((output, star_dim), dim=-1)
+ return output, output_lengths
+
+ @torch.jit.export
+ def extract_features(
+ self,
+ waveforms: Tensor,
+ lengths: Optional[Tensor] = None,
+ num_layers: Optional[int] = None,
+ ) -> Tuple[List[Tensor], Optional[Tensor]]:
+ if self.normalize_waveform:
+ waveforms = nn.functional.layer_norm(waveforms, waveforms.shape)
+ return self.model.extract_features(waveforms, lengths, num_layers)
+
+
+def _extend_model(module, normalize_waveform, apply_log_softmax=False, append_star=False):
+ """Add extra transformations to the model"""
+ return _Wav2Vec2Model(module, normalize_waveform, apply_log_softmax, append_star)
+
+
+def _remove_aux_axes(state_dict, axes):
+    # Remove the seemingly unnecessary axes.
+    # For the ASR task, the pretrained weights originating from fairseq have unrelated dimensions at indices 1, 2, 3.
+    # They come from fairseq's Dictionary implementation, which was intended for NLP tasks
+    # and is not used during ASR training.
+    # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/data/dictionary.py#L21-L37
+    # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/criterions/ctc.py#L126-L129
+    #
+    # Also, some pretrained weights originating from VoxPopuli have an extra dimension that is almost never used
+    # and appears to be a mistake.
+    # The label `1` shows up in the training dataset of German (1 out of 16M),
+    # English (1 / 28M), Spanish (1 / 9.4M), Romanian (1 / 4.7M) and Polish (6 / 5.8M)
+ for key in ["aux.weight", "aux.bias"]:
+ mat = state_dict[key]
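+        # Keep only the rows whose indices are not in `axes` (drops the unused output classes).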
+ state_dict[key] = torch.stack([mat[i] for i in range(mat.size(0)) if i not in axes])
+
+
+def _get_state_dict(url, dl_kwargs, remove_axes=None):
+ if not url.startswith("https"):
+ url = f"https://download.pytorch.org/torchaudio/models/{url}"
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ state_dict = load_state_dict_from_url(url, **dl_kwargs)
+ if remove_axes:
+ _remove_aux_axes(state_dict, remove_axes)
+ return state_dict
+
+
+def _get_en_labels():
+ return (
+ "|",
+ "E",
+ "T",
+ "A",
+ "O",
+ "N",
+ "I",
+ "H",
+ "S",
+ "R",
+ "D",
+ "L",
+ "U",
+ "M",
+ "W",
+ "C",
+ "F",
+ "G",
+ "Y",
+ "P",
+ "B",
+ "V",
+ "K",
+ "'",
+ "X",
+ "J",
+ "Q",
+ "Z",
+ )
+
+
+def _get_de_labels():
+ return (
+ "|",
+ "e",
+ "n",
+ "i",
+ "r",
+ "s",
+ "t",
+ "a",
+ "d",
+ "h",
+ "u",
+ "l",
+ "g",
+ "c",
+ "m",
+ "o",
+ "b",
+ "w",
+ "f",
+ "k",
+ "z",
+ "p",
+ "v",
+ "ü",
+ "ä",
+ "ö",
+ "j",
+ "ß",
+ "y",
+ "x",
+ "q",
+ )
+
+
+def _get_vp_en_labels():
+ return (
+ "|",
+ "e",
+ "t",
+ "o",
+ "i",
+ "a",
+ "n",
+ "s",
+ "r",
+ "h",
+ "l",
+ "d",
+ "c",
+ "u",
+ "m",
+ "p",
+ "f",
+ "g",
+ "w",
+ "y",
+ "b",
+ "v",
+ "k",
+ "x",
+ "j",
+ "q",
+ "z",
+ )
+
+
+def _get_es_labels():
+ return (
+ "|",
+ "e",
+ "a",
+ "o",
+ "s",
+ "n",
+ "r",
+ "i",
+ "l",
+ "d",
+ "c",
+ "t",
+ "u",
+ "p",
+ "m",
+ "b",
+ "q",
+ "y",
+ "g",
+ "v",
+ "h",
+ "ó",
+ "f",
+ "í",
+ "á",
+ "j",
+ "z",
+ "ñ",
+ "é",
+ "x",
+ "ú",
+ "k",
+ "w",
+ "ü",
+ )
+
+
+def _get_fr_labels():
+ return (
+ "|",
+ "e",
+ "s",
+ "n",
+ "i",
+ "t",
+ "r",
+ "a",
+ "o",
+ "u",
+ "l",
+ "d",
+ "c",
+ "p",
+ "m",
+ "é",
+ "v",
+ "q",
+ "f",
+ "g",
+ "b",
+ "h",
+ "x",
+ "à",
+ "j",
+ "è",
+ "y",
+ "ê",
+ "z",
+ "ô",
+ "k",
+ "ç",
+ "œ",
+ "û",
+ "ù",
+ "î",
+ "â",
+ "w",
+ "ï",
+ "ë",
+ "ü",
+ "æ",
+ )
+
+
+def _get_it_labels():
+ return (
+ "|",
+ "e",
+ "i",
+ "a",
+ "o",
+ "n",
+ "t",
+ "r",
+ "l",
+ "s",
+ "c",
+ "d",
+ "u",
+ "p",
+ "m",
+ "g",
+ "v",
+ "h",
+ "z",
+ "f",
+ "b",
+ "q",
+ "à",
+ "è",
+ "ù",
+ "é",
+ "ò",
+ "ì",
+ "k",
+ "y",
+ "x",
+ "w",
+ "j",
+ "ó",
+ "í",
+ "ï",
+ )
+
+
+def _get_mms_labels():
+ return (
+ "a",
+ "i",
+ "e",
+ "n",
+ "o",
+ "u",
+ "t",
+ "s",
+ "r",
+ "m",
+ "k",
+ "l",
+ "d",
+ "g",
+ "h",
+ "y",
+ "b",
+ "p",
+ "w",
+ "c",
+ "v",
+ "j",
+ "z",
+ "f",
+ "'",
+ "q",
+ "x",
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/pipelines/rnnt_pipeline.py b/MLPY/Lib/site-packages/torchaudio/pipelines/rnnt_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a6f50941113d181e9950b3a3c7eadb9c1359a01
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/pipelines/rnnt_pipeline.py
@@ -0,0 +1,380 @@
+import json
+import math
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from functools import partial
+from typing import Callable, List, Tuple
+
+import torch
+import torchaudio
+from torchaudio._internal import module_utils
+from torchaudio.models import emformer_rnnt_base, RNNT, RNNTBeamSearch
+
+
+__all__ = []
+
+_decibel = 2 * 20 * math.log10(torch.iinfo(torch.int16).max)
+_gain = pow(10, 0.05 * _decibel)
+
+
+def _piecewise_linear_log(x):
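+    # Apply natural log above x = e and scale linearly below; the mapping is continuous at x = e.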
+ x[x > math.e] = torch.log(x[x > math.e])
+ x[x <= math.e] = x[x <= math.e] / math.e
+ return x
+
+
+class _FunctionalModule(torch.nn.Module):
+ def __init__(self, functional):
+ super().__init__()
+ self.functional = functional
+
+ def forward(self, input):
+ return self.functional(input)
+
+
+class _GlobalStatsNormalization(torch.nn.Module):
+ def __init__(self, global_stats_path):
+ super().__init__()
+
+ with open(global_stats_path) as f:
+ blob = json.loads(f.read())
+
+ self.register_buffer("mean", torch.tensor(blob["mean"]))
+ self.register_buffer("invstddev", torch.tensor(blob["invstddev"]))
+
+ def forward(self, input):
+ return (input - self.mean) * self.invstddev
+
+
+class _FeatureExtractor(ABC):
+ @abstractmethod
+ def __call__(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Generates features and length output from the given input tensor.
+
+ Args:
+ input (torch.Tensor): input tensor.
+
+ Returns:
+ (torch.Tensor, torch.Tensor):
+ torch.Tensor:
+ Features, with shape `(length, *)`.
+ torch.Tensor:
+ Length, with shape `(1,)`.
+ """
+
+
+class _TokenProcessor(ABC):
+ @abstractmethod
+ def __call__(self, tokens: List[int], **kwargs) -> str:
+ """Decodes given list of tokens to text sequence.
+
+ Args:
+ tokens (List[int]): list of tokens to decode.
+
+ Returns:
+ str:
+ Decoded text sequence.
+ """
+
+
+class _ModuleFeatureExtractor(torch.nn.Module, _FeatureExtractor):
+ """``torch.nn.Module``-based feature extraction pipeline.
+
+ Args:
+ pipeline (torch.nn.Module): module that implements feature extraction logic.
+ """
+
+ def __init__(self, pipeline: torch.nn.Module) -> None:
+ super().__init__()
+ self.pipeline = pipeline
+
+ def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Generates features and length output from the given input tensor.
+
+ Args:
+ input (torch.Tensor): input tensor.
+
+ Returns:
+ (torch.Tensor, torch.Tensor):
+ torch.Tensor:
+ Features, with shape `(length, *)`.
+ torch.Tensor:
+ Length, with shape `(1,)`.
+ """
+ features = self.pipeline(input)
+ length = torch.tensor([features.shape[0]])
+ return features, length
+
+
+class _SentencePieceTokenProcessor(_TokenProcessor):
+ """SentencePiece-model-based token processor.
+
+ Args:
+ sp_model_path (str): path to SentencePiece model.
+ """
+
+ def __init__(self, sp_model_path: str) -> None:
+ if not module_utils.is_module_available("sentencepiece"):
+ raise RuntimeError("SentencePiece is not available. Please install it.")
+
+ import sentencepiece as spm
+
+ self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path)
+ self.post_process_remove_list = {
+ self.sp_model.unk_id(),
+ self.sp_model.eos_id(),
+ self.sp_model.pad_id(),
+ }
+
+ def __call__(self, tokens: List[int], lstrip: bool = True) -> str:
+ """Decodes given list of tokens to text sequence.
+
+ Args:
+ tokens (List[int]): list of tokens to decode.
+ lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace
+ removed. (Default: ``True``).
+
+ Returns:
+ str:
+ Decoded text sequence.
+ """
+ filtered_hypo_tokens = [
+ token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list
+ ]
+ output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ")
+
+ if lstrip:
+ return output_string.lstrip()
+ else:
+ return output_string
+
+
+@dataclass
+class RNNTBundle:
+ """Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text)
+ inference with an RNN-T model.
+
+ More specifically, the class provides methods that produce the featurization pipeline,
+ decoder wrapping the specified RNN-T model, and output token post-processor that together
+ constitute a complete end-to-end ASR inference pipeline that produces a text sequence
+ given a raw waveform.
+
+    It supports both non-streaming (full-context) and streaming inference.
+
+ Users should not directly instantiate objects of this class; rather, users should use the
+ instances (representing pre-trained models) that exist within the module,
+ e.g. :data:`torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH`.
+
+ Example
+ >>> import torchaudio
+ >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
+ >>> import torch
+ >>>
+ >>> # Non-streaming inference.
+ >>> # Build feature extractor, decoder with RNN-T model, and token processor.
+ >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor()
+ 100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s]
+ >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder()
+ Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt"
+ 100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s]
+ >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor()
+ 100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s]
+ >>>
+ >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample.
+ >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean")
+ >>> waveform = next(iter(dataset))[0].squeeze()
+ >>>
+ >>> with torch.no_grad():
+ >>> # Produce mel-scale spectrogram features.
+ >>> features, length = feature_extractor(waveform)
+ >>>
+ >>> # Generate top-10 hypotheses.
+ >>> hypotheses = decoder(features, length, 10)
+ >>>
+ >>> # For top hypothesis, convert predicted tokens to text.
+ >>> text = token_processor(hypotheses[0][0])
+ >>> print(text)
+ he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...]
+ >>>
+ >>>
+ >>> # Streaming inference.
+ >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length
+ >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length
+ >>> num_samples_segment_right_context = (
+ >>> num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length
+ >>> )
+ >>>
+ >>> # Build streaming inference feature extractor.
+ >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor()
+ >>>
+ >>> # Process same waveform as before, this time sequentially across overlapping segments
+ >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``.
+ >>> state, hypothesis = None, None
+ >>> for idx in range(0, len(waveform), num_samples_segment):
+ >>> segment = waveform[idx: idx + num_samples_segment_right_context]
+ >>> segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment)))
+ >>> with torch.no_grad():
+ >>> features, length = streaming_feature_extractor(segment)
+ >>> hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
+ >>> hypothesis = hypotheses[0]
+ >>> transcript = token_processor(hypothesis[0])
+ >>> if transcript:
+ >>> print(transcript, end=" ", flush=True)
+ he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...]
+ """
+
+ class FeatureExtractor(_FeatureExtractor):
+ """Interface of the feature extraction part of RNN-T pipeline"""
+
+ class TokenProcessor(_TokenProcessor):
+ """Interface of the token processor part of RNN-T pipeline"""
+
+ _rnnt_path: str
+ _rnnt_factory_func: Callable[[], RNNT]
+ _global_stats_path: str
+ _sp_model_path: str
+ _right_padding: int
+ _blank: int
+ _sample_rate: int
+ _n_fft: int
+ _n_mels: int
+ _hop_length: int
+ _segment_length: int
+ _right_context_length: int
+
+ def _get_model(self) -> RNNT:
+ model = self._rnnt_factory_func()
+ path = torchaudio.utils.download_asset(self._rnnt_path)
+ state_dict = torch.load(path)
+ model.load_state_dict(state_dict)
+ model.eval()
+ return model
+
+ @property
+ def sample_rate(self) -> int:
+ """Sample rate (in cycles per second) of input waveforms.
+
+ :type: int
+ """
+ return self._sample_rate
+
+ @property
+ def n_fft(self) -> int:
+ """Size of FFT window to use.
+
+ :type: int
+ """
+ return self._n_fft
+
+ @property
+ def n_mels(self) -> int:
+ """Number of mel spectrogram features to extract from input waveforms.
+
+ :type: int
+ """
+ return self._n_mels
+
+ @property
+ def hop_length(self) -> int:
+ """Number of samples between successive frames in input expected by model.
+
+ :type: int
+ """
+ return self._hop_length
+
+ @property
+ def segment_length(self) -> int:
+ """Number of frames in segment in input expected by model.
+
+ :type: int
+ """
+ return self._segment_length
+
+ @property
+ def right_context_length(self) -> int:
+ """Number of frames in right contextual block in input expected by model.
+
+ :type: int
+ """
+ return self._right_context_length
+
+ def get_decoder(self) -> RNNTBeamSearch:
+ """Constructs RNN-T decoder.
+
+ Returns:
+ RNNTBeamSearch
+ """
+ model = self._get_model()
+ return RNNTBeamSearch(model, self._blank)
+
+ def get_feature_extractor(self) -> FeatureExtractor:
+ """Constructs feature extractor for non-streaming (full-context) ASR.
+
+ Returns:
+ FeatureExtractor
+ """
+ local_path = torchaudio.utils.download_asset(self._global_stats_path)
+ return _ModuleFeatureExtractor(
+ torch.nn.Sequential(
+ torchaudio.transforms.MelSpectrogram(
+ sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length
+ ),
+ _FunctionalModule(lambda x: x.transpose(1, 0)),
+ _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)),
+ _GlobalStatsNormalization(local_path),
+ _FunctionalModule(lambda x: torch.nn.functional.pad(x, (0, 0, 0, self._right_padding))),
+ )
+ )
+
+ def get_streaming_feature_extractor(self) -> FeatureExtractor:
+ """Constructs feature extractor for streaming (simultaneous) ASR.
+
+ Returns:
+ FeatureExtractor
+ """
+ local_path = torchaudio.utils.download_asset(self._global_stats_path)
+ return _ModuleFeatureExtractor(
+ torch.nn.Sequential(
+ torchaudio.transforms.MelSpectrogram(
+ sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length
+ ),
+ _FunctionalModule(lambda x: x.transpose(1, 0)),
+ _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)),
+ _GlobalStatsNormalization(local_path),
+ )
+ )
+
+ def get_token_processor(self) -> TokenProcessor:
+ """Constructs token processor.
+
+ Returns:
+ TokenProcessor
+ """
+ local_path = torchaudio.utils.download_asset(self._sp_model_path)
+ return _SentencePieceTokenProcessor(local_path)
+
+
+EMFORMER_RNNT_BASE_LIBRISPEECH = RNNTBundle(
+ _rnnt_path="models/emformer_rnnt_base_librispeech.pt",
+ _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=4097),
+ _global_stats_path="pipeline-assets/global_stats_rnnt_librispeech.json",
+ _sp_model_path="pipeline-assets/spm_bpe_4096_librispeech.model",
+ _right_padding=4,
+ _blank=4096,
+ _sample_rate=16000,
+ _n_fft=400,
+ _n_mels=80,
+ _hop_length=160,
+ _segment_length=16,
+ _right_context_length=4,
+)
+EMFORMER_RNNT_BASE_LIBRISPEECH.__doc__ = """ASR pipeline based on Emformer-RNNT,
+pretrained on *LibriSpeech* dataset :cite:`7178964`,
+capable of performing both streaming and non-streaming inference.
+
+The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
+and utilizes weights trained on LibriSpeech using training script ``train.py``
+`here `__ with default arguments.
+
+Please refer to :py:class:`RNNTBundle` for usage instructions.
+"""
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ef33317053f3989a3590c081785439c031f103e
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..66123a85ed4d8dd27e0eedaa8a0c8e5552da6348
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__init__.py
@@ -0,0 +1,4 @@
+from .musan import Musan
+
+
+__all__ = ["Musan"]
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69e89fcaa26f3c4b7f744684089177b62252dc48
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__pycache__/musan.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__pycache__/musan.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5c7b7f7e7e85e86be3e25526974cf392d13dca9
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/__pycache__/musan.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/datasets/musan.py b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/musan.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e8fe3d6a1342378c70108cad7679d8fd64b7c7a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/datasets/musan.py
@@ -0,0 +1,67 @@
+from pathlib import Path
+from typing import Tuple, Union
+
+import torch
+from torch.utils.data import Dataset
+from torchaudio.datasets.utils import _load_waveform
+
+
+_SUBSETS = ["music", "noise", "speech"]
+_SAMPLE_RATE = 16_000
+
+
+class Musan(Dataset):
+ r"""*MUSAN* :cite:`musan2015` dataset.
+
+ Args:
+ root (str or Path): Root directory where the dataset's top-level directory exists.
+ subset (str): Subset of the dataset to use. Options: [``"music"``, ``"noise"``, ``"speech"``].
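+
+    Example
+        A minimal sketch (the dataset root below is a hypothetical path):
+
+        >>> dataset = Musan("/path/to/musan", subset="noise")
+        >>> waveform, sample_rate, filename = dataset[0]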
+ """
+
+ def __init__(self, root: Union[str, Path], subset: str):
+ if subset not in _SUBSETS:
+ raise ValueError(f"Invalid subset '{subset}' given. Please provide one of {_SUBSETS}")
+
+ subset_path = Path(root) / subset
+ self._walker = [str(p) for p in subset_path.glob("*/*.*")]
+
+ def get_metadata(self, n: int) -> Tuple[str, int, str]:
+ r"""Get metadata for the n-th sample in the dataset. Returns filepath instead of waveform,
+ but otherwise returns the same fields as :py:func:`__getitem__`.
+
+ Args:
+ n (int): Index of sample to be loaded.
+
+ Returns:
+ (str, int, str):
+ str
+ Path to audio.
+ int
+ Sample rate.
+ str
+ File name.
+ """
+ audio_path = self._walker[n]
+ return audio_path, _SAMPLE_RATE, Path(audio_path).name
+
+ def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
+ r"""Return the n-th sample in the dataset.
+
+ Args:
+ n (int): Index of sample to be loaded.
+
+ Returns:
+ (torch.Tensor, int, str):
+ torch.Tensor
+ Waveform.
+ int
+ Sample rate.
+ str
+ File name.
+ """
+ audio_path, sample_rate, filename = self.get_metadata(n)
+ path = Path(audio_path)
+ return _load_waveform(path.parent, path.name, sample_rate), sample_rate, filename
+
+ def __len__(self) -> int:
+ return len(self._walker)
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d7daa6a6069c0caca0b191fe174af78e1dab72
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__init__.py
@@ -0,0 +1,26 @@
+from ._dsp import (
+ adsr_envelope,
+ exp_sigmoid,
+ extend_pitch,
+ filter_waveform,
+ frequency_impulse_response,
+ oscillator_bank,
+ sinc_impulse_response,
+)
+from ._rir import ray_tracing, simulate_rir_ism
+from .functional import barkscale_fbanks, chroma_filterbank
+
+
+__all__ = [
+ "adsr_envelope",
+ "exp_sigmoid",
+ "barkscale_fbanks",
+ "chroma_filterbank",
+ "extend_pitch",
+ "filter_waveform",
+ "frequency_impulse_response",
+ "oscillator_bank",
+ "ray_tracing",
+ "sinc_impulse_response",
+ "simulate_rir_ism",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3da9ea9b38294448a38ba8ad09359a1b25680e22
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/_dsp.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/_dsp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..945b0aae5e8f2bd733a02998935fd3c4be9d8702
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/_dsp.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/_rir.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/_rir.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63922e59535d22d938464f59ce12907b2ec7395d
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/_rir.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/functional.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/functional.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a521c7e98378dc3a167ba2db1ea5d3c8bbb1a2e7
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/functional/__pycache__/functional.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/_dsp.py b/MLPY/Lib/site-packages/torchaudio/prototype/functional/_dsp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c590374b3fe8ce58479ed0c0388a551ad8765004
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/functional/_dsp.py
@@ -0,0 +1,433 @@
+import warnings
+from typing import List, Optional, Union
+
+import torch
+
+from torchaudio.functional import fftconvolve
+
+
+def oscillator_bank(
+ frequencies: torch.Tensor,
+ amplitudes: torch.Tensor,
+ sample_rate: float,
+ reduction: str = "sum",
+ dtype: Optional[torch.dtype] = torch.float64,
+) -> torch.Tensor:
+ """Synthesize waveform from the given instantaneous frequencies and amplitudes.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Note:
+ The phase information of the output waveform is found by taking the cumulative sum
+ of the given instantaneous frequencies (``frequencies``).
+ This incurs roundoff error when the data type does not have enough precision.
+ Using ``torch.float64`` can work around this.
+
+ The following figure shows the difference between ``torch.float32`` and
+ ``torch.float64`` when generating a sin wave of constant frequency and amplitude
+ with sample rate 8000 [Hz].
+ Notice that ``torch.float32`` version shows artifacts that are not seen in
+ ``torch.float64`` version.
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/oscillator_precision.png
+
+ Args:
+ frequencies (Tensor): Sample-wise oscillator frequencies (Hz). Shape `(..., time, N)`.
+ amplitudes (Tensor): Sample-wise oscillator amplitude. Shape: `(..., time, N)`.
+ sample_rate (float): Sample rate
+ reduction (str): Reduction to perform.
+ Valid values are ``"sum"``, ``"mean"`` or ``"none"``. Default: ``"sum"``
+ dtype (torch.dtype or None, optional): The data type on which cumulative sum operation is performed.
+ Default: ``torch.float64``. Pass ``None`` to disable the casting.
+
+ Returns:
+ Tensor:
+ The resulting waveform.
+
+ If ``reduction`` is ``"none"``, then the shape is
+ `(..., time, N)`, otherwise the shape is `(..., time)`.
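+
+    Example
+        A minimal sketch synthesizing one second of a 440 Hz tone (the 8000 Hz sample rate
+        below is chosen arbitrarily for illustration):
+
+        >>> sample_rate = 8000
+        >>> num_frames = sample_rate  # one second of frames
+        >>> freq = torch.full((num_frames, 1), 440.0)
+        >>> amp = torch.ones((num_frames, 1))
+        >>> waveform = oscillator_bank(freq, amp, sample_rate=sample_rate)
+        >>> waveform.shape
+        torch.Size([8000])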
+ """
+ if frequencies.shape != amplitudes.shape:
+ raise ValueError(
+ "The shapes of `frequencies` and `amplitudes` must match. "
+ f"Found: {frequencies.shape} and {amplitudes.shape} respectively."
+ )
+ reductions = ["sum", "mean", "none"]
+ if reduction not in reductions:
+ raise ValueError(f"The value of reduction must be either {reductions}. Found: {reduction}")
+
+ invalid = torch.abs(frequencies) >= sample_rate / 2
+ if torch.any(invalid):
+ warnings.warn(
+ "Some frequencies are above nyquist frequency. "
+ "Setting the corresponding amplitude to zero. "
+ "This might cause numerically unstable gradient."
+ )
+ amplitudes = torch.where(invalid, 0.0, amplitudes)
+
+ pi2 = 2.0 * torch.pi
+ freqs = frequencies * pi2 / sample_rate % pi2
+ phases = torch.cumsum(freqs, dim=-2, dtype=dtype)
+ if dtype is not None and freqs.dtype != dtype:
+ phases = phases.to(freqs.dtype)
+
+ waveform = amplitudes * torch.sin(phases)
+ if reduction == "sum":
+ return waveform.sum(-1)
+ if reduction == "mean":
+ return waveform.mean(-1)
+ return waveform
+
+
+def adsr_envelope(
+ num_frames: int,
+ *,
+ attack: float = 0.0,
+ hold: float = 0.0,
+ decay: float = 0.0,
+ sustain: float = 1.0,
+ release: float = 0.0,
+ n_decay: int = 2,
+ dtype: Optional[torch.dtype] = None,
+ device: Optional[torch.device] = None,
+):
+ """Generate ADSR Envelope
+
+ .. devices:: CPU CUDA
+
+ Args:
+ num_frames (int): The number of output frames.
+ attack (float, optional):
+ The relative *time* it takes to reach the maximum level from
+ the start. (Default: ``0.0``)
+ hold (float, optional):
+ The relative *time* the maximum level is held before
+ it starts to decay. (Default: ``0.0``)
+ decay (float, optional):
+ The relative *time* it takes to sustain from
+ the maximum level. (Default: ``0.0``)
+ sustain (float, optional): The relative *level* at which
+ the sound should sustain. (Default: ``1.0``)
+
+ .. Note::
+ The duration of sustain is derived as `1.0 - (The sum of attack, hold, decay and release)`.
+
+ release (float, optional): The relative *time* it takes for the sound level to
+ reach zero after the sustain. (Default: ``0.0``)
+ n_decay (int, optional): The degree of polynomial decay. Default: ``2``.
+ dtype (torch.dtype, optional): the desired data type of returned tensor.
+ Default: if ``None``, uses a global default
+ (see :py:func:`torch.set_default_tensor_type`).
+ device (torch.device, optional): the desired device of returned tensor.
+ Default: if ``None``, uses the current device for the default tensor type
+ (see :py:func:`torch.set_default_tensor_type`).
+ device will be the CPU for CPU tensor types and the current CUDA
+ device for CUDA tensor types.
+
+ Returns:
+ Tensor: ADSR Envelope. Shape: `(num_frames, )`
+
+ Example
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/adsr_examples.png
+
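+        A minimal sketch (parameter values below are chosen arbitrarily for illustration):
+
+        >>> env = adsr_envelope(
+        >>>     16, attack=0.25, hold=0.25, decay=0.25, sustain=0.5, release=0.25)
+        >>> env.shape
+        torch.Size([16])
+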
+ """
+ if not 0 <= attack <= 1:
+ raise ValueError(f"The value of `attack` must be within [0, 1]. Found: {attack}")
+ if not 0 <= decay <= 1:
+ raise ValueError(f"The value of `decay` must be within [0, 1]. Found: {decay}")
+ if not 0 <= sustain <= 1:
+ raise ValueError(f"The value of `sustain` must be within [0, 1]. Found: {sustain}")
+ if not 0 <= hold <= 1:
+ raise ValueError(f"The value of `hold` must be within [0, 1]. Found: {hold}")
+ if not 0 <= release <= 1:
+ raise ValueError(f"The value of `release` must be within [0, 1]. Found: {release}")
+ if attack + decay + release + hold > 1:
+ raise ValueError("The sum of `attack`, `hold`, `decay` and `release` must not exceed 1.")
+
+ nframes = num_frames - 1
+ num_a = int(nframes * attack)
+ num_h = int(nframes * hold)
+ num_d = int(nframes * decay)
+ num_r = int(nframes * release)
+
+ # Initialize with sustain
+ out = torch.full((num_frames,), float(sustain), device=device, dtype=dtype)
+
+ # attack
+ if num_a > 0:
+ torch.linspace(0.0, 1.0, num_a + 1, out=out[: num_a + 1])
+
+ # hold
+ if num_h > 0:
+ out[num_a : num_a + num_h + 1] = 1.0
+
+ # decay
+ if num_d > 0:
+ # Compute: sustain + (1.0 - sustain) * (linspace[1, 0] ** n_decay)
+ i = num_a + num_h
+ decay = out[i : i + num_d + 1]
+ torch.linspace(1.0, 0.0, num_d + 1, out=decay)
+ decay **= n_decay
+ decay *= 1.0 - sustain
+ decay += sustain
+
+ # sustain is handled by initialization
+
+ # release
+ if num_r > 0:
+ torch.linspace(sustain, 0, num_r + 1, out=out[-num_r - 1 :])
+
+ return out
+
+
+def extend_pitch(
+ base: torch.Tensor,
+ pattern: Union[int, List[float], torch.Tensor],
+):
+ """Extend the given time series values with multipliers of them.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Given a series of fundamental frequencies (pitch), this function appends
+ its harmonic overtones or inharmonic partials.
+
+ Args:
+ base (torch.Tensor):
+ Base time series, like fundamental frequencies (Hz). Shape: `(..., time, 1)`.
+ pattern (int, list of floats or torch.Tensor):
+ If ``int``, the number of pitch series after the operation.
+ `pattern - 1` tones are added, so that the resulting Tensor contains
+ up to `pattern`-th overtones of the given series.
+
+ If list of float or ``torch.Tensor``, it must be one dimensional,
+ representing the custom multiplier of the fundamental frequency.
+
+ Returns:
+ Tensor: Oscillator frequencies (Hz). Shape: `(..., time, num_tones)`.
+
+ Example
+ >>> # fundamental frequency
+ >>> f0 = torch.linspace(1, 5, 5).unsqueeze(-1)
+ >>> f0
+ tensor([[1.],
+ [2.],
+ [3.],
+ [4.],
+ [5.]])
+ >>> # Add harmonic overtones, up to 3rd.
+ >>> f = extend_pitch(f0, 3)
+ >>> f.shape
+ torch.Size([5, 3])
+ >>> f
+ tensor([[ 1., 2., 3.],
+ [ 2., 4., 6.],
+ [ 3., 6., 9.],
+ [ 4., 8., 12.],
+ [ 5., 10., 15.]])
+ >>> # Add custom (inharmonic) partials.
+ >>> f = extend_pitch(f0, torch.tensor([1, 2.1, 3.3, 4.5]))
+ >>> f.shape
+ torch.Size([5, 4])
+ >>> f
+ tensor([[ 1.0000, 2.1000, 3.3000, 4.5000],
+ [ 2.0000, 4.2000, 6.6000, 9.0000],
+ [ 3.0000, 6.3000, 9.9000, 13.5000],
+ [ 4.0000, 8.4000, 13.2000, 18.0000],
+ [ 5.0000, 10.5000, 16.5000, 22.5000]])
+ """
+ if isinstance(pattern, torch.Tensor):
+ mult = pattern
+ elif isinstance(pattern, int):
+ mult = torch.linspace(1.0, float(pattern), pattern, device=base.device, dtype=base.dtype)
+ else:
+ mult = torch.tensor(pattern, dtype=base.dtype, device=base.device)
+ h_freq = base @ mult.unsqueeze(0)
+ return h_freq
+
+
+def sinc_impulse_response(cutoff: torch.Tensor, window_size: int = 513, high_pass: bool = False):
+ """Create windowed-sinc impulse response for given cutoff frequencies.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ cutoff (Tensor): Cutoff frequencies for low-pass sinc filter.
+
+ window_size (int, optional): Size of the Hamming window to apply. Must be odd.
+ (Default: 513)
+
+ high_pass (bool, optional):
+ If ``True``, convert the resulting filter to high-pass.
+ Otherwise low-pass filter is returned. Default: ``False``.
+
+ Returns:
+ Tensor: A series of impulse responses. Shape: `(..., window_size)`.
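+
+    Example
+        A minimal sketch showing the output shape (the cutoff values below are arbitrary):
+
+        >>> cutoff = torch.tensor([0.1, 0.4])
+        >>> ir = sinc_impulse_response(cutoff, window_size=513)
+        >>> ir.shape
+        torch.Size([2, 513])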
+ """
+ if window_size % 2 == 0:
+ raise ValueError(f"`window_size` must be odd. Given: {window_size}")
+
+ half = window_size // 2
+ device, dtype = cutoff.device, cutoff.dtype
+ idx = torch.linspace(-half, half, window_size, device=device, dtype=dtype)
+
+ filt = torch.special.sinc(cutoff.unsqueeze(-1) * idx.unsqueeze(0))
+ filt = filt * torch.hamming_window(window_size, device=device, dtype=dtype, periodic=False).unsqueeze(0)
+ filt = filt / filt.sum(dim=-1, keepdim=True).abs()
+
+ # High pass IR is obtained by subtracting low_pass IR from delta function.
+ # https://courses.engr.illinois.edu/ece401/fa2020/slides/lec10.pdf
+ if high_pass:
+ filt = -filt
+ filt[..., half] = 1.0 + filt[..., half]
+ return filt
+
+
+def frequency_impulse_response(magnitudes):
+ """Create filter from desired frequency response
+
+ Args:
+        magnitudes (Tensor): The desired frequency responses. Shape: `(..., num_fft_bins)`
+
+ Returns:
+ Tensor: Impulse response. Shape `(..., 2 * (num_fft_bins - 1))`
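+
+    Example
+        A minimal sketch showing the output shape (the magnitudes below are random placeholders):
+
+        >>> magnitudes = torch.rand(3, 257)
+        >>> ir = frequency_impulse_response(magnitudes)
+        >>> ir.shape
+        torch.Size([3, 512])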
+ """
+ if magnitudes.min() < 0.0:
+        # A negative magnitude does not make sense, but it is allowed so that autograd works
+        # around 0.
+        # Should we raise an error?
+ warnings.warn("The input frequency response should not contain negative values.")
+ ir = torch.fft.fftshift(torch.fft.irfft(magnitudes), dim=-1)
+ device, dtype = magnitudes.device, magnitudes.dtype
+ window = torch.hann_window(ir.size(-1), periodic=False, device=device, dtype=dtype).expand_as(ir)
+ return ir * window
+
+
+def _overlap_and_add(waveform, stride):
+ num_frames, frame_size = waveform.shape[-2:]
+ numel = (num_frames - 1) * stride + frame_size
+ buffer = torch.zeros(waveform.shape[:-2] + (numel,), device=waveform.device, dtype=waveform.dtype)
+ for i in range(num_frames):
+ start = i * stride
+ end = start + frame_size
+ buffer[..., start:end] += waveform[..., i, :]
+ return buffer
+
+
+def filter_waveform(waveform: torch.Tensor, kernels: torch.Tensor, delay_compensation: int = -1):
+ """Applies filters along time axis of the given waveform.
+
+ This function applies the given filters along time axis in the following manner:
+
+ 1. Split the given waveform into chunks. The number of chunks is equal to the number of given filters.
+ 2. Filter each chunk with corresponding filter.
+ 3. Place the filtered chunks at the original indices while adding up the overlapping parts.
+ 4. Crop the resulting waveform so that delay introduced by the filter is removed and its length
+ matches that of the input waveform.
+
+ The following figure illustrates this.
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/filter_waveform.png
+
+ .. note::
+
+ If the number of filters is one, then the operation becomes stationary.
+ i.e. the same filtering is applied across the time axis.
+
+ Args:
+ waveform (Tensor): Shape `(..., time)`.
+ kernels (Tensor): Impulse responses.
+ Valid inputs are 2D tensor with shape `(num_filters, filter_length)` or
+ `(N+1)`-D tensor with shape `(..., num_filters, filter_length)`, where `N` is
+ the dimension of waveform.
+
+ In case of 2D input, the same set of filters is used across channels and batches.
+ Otherwise, different sets of filters are applied. In this case, the shape of
+ the first `N-1` dimensions of filters must match (or be broadcastable to) that of waveform.
+
+ delay_compensation (int): Control how the waveform is cropped after full convolution.
+ If the value is zero or positive, it is interpreted as the length of crop at the
+ beginning of the waveform. The value cannot be larger than the size of filter kernel.
+ Otherwise the initial crop is ``filter_size // 2``.
+ When cropping happens, the waveform is also cropped from the end so that the
+ length of the resulting waveform matches the input waveform.
+
+ Returns:
+ Tensor: `(..., time)`.
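+
+    Example
+        A minimal sketch showing the output shape (the waveform and kernels below are random
+        placeholders; four filters are applied over four consecutive chunks):
+
+        >>> waveform = torch.randn(2, 8000)
+        >>> kernels = torch.randn(4, 101)
+        >>> filter_waveform(waveform, kernels).shape
+        torch.Size([2, 8000])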
+ """
+ if kernels.ndim not in [2, waveform.ndim + 1]:
+ raise ValueError(
+ "`kernels` must be 2 or N+1 dimension where "
+ f"N is the dimension of waveform. Found: {kernels.ndim} (N={waveform.ndim})"
+ )
+
+ num_filters, filter_size = kernels.shape[-2:]
+ num_frames = waveform.size(-1)
+
+ if delay_compensation > filter_size:
+        raise ValueError(
+            "When `delay_compensation` is provided, it cannot be larger than the size of filters. "
+            f"Found: delay_compensation={delay_compensation}, filter_size={filter_size}"
+        )
+
+ # Transform waveform's time axis into (num_filters x chunk_length) with optional padding
+ chunk_length = num_frames // num_filters
+ if num_frames % num_filters > 0:
+ chunk_length += 1
+ num_pad = chunk_length * num_filters - num_frames
+ waveform = torch.nn.functional.pad(waveform, [0, num_pad], "constant", 0)
+ chunked = waveform.unfold(-1, chunk_length, chunk_length)
+ assert chunked.numel() >= waveform.numel()
+
+ # Broadcast kernels
+ if waveform.ndim + 1 > kernels.ndim:
+ expand_shape = waveform.shape[:-1] + kernels.shape
+ kernels = kernels.expand(expand_shape)
+
+ convolved = fftconvolve(chunked, kernels)
+ restored = _overlap_and_add(convolved, chunk_length)
+
+ # Trim in a way that the number of samples are same as input,
+ # and the filter delay is compensated
+ if delay_compensation >= 0:
+ start = delay_compensation
+ else:
+ start = filter_size // 2
+ num_crops = restored.size(-1) - num_frames
+ end = num_crops - start
+ result = restored[..., start:-end]
+ return result
+
+
+def exp_sigmoid(
+ input: torch.Tensor, exponent: float = 10.0, max_value: float = 2.0, threshold: float = 1e-7
+) -> torch.Tensor:
+ """Exponential Sigmoid pointwise nonlinearity.
+ Implements the equation:
+ ``max_value`` * sigmoid(``input``) ** (log(``exponent``)) + ``threshold``
+
+ The output has a range of [``threshold``, ``max_value``].
+ ``exponent`` controls the slope of the output.
+
+ .. devices:: CPU CUDA
+
+ Args:
+ input (Tensor): Input Tensor
+ exponent (float, optional): Exponent. Controls the slope of the output
+ max_value (float, optional): Maximum value of the output
+ threshold (float, optional): Minimum value of the output
+
+ Returns:
+ Tensor: Exponential Sigmoid output. Shape: same as input
+
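+    Example
+        A minimal sketch with default parameters (outputs approach ``threshold`` for very
+        negative inputs and ``max_value`` for very positive inputs):
+
+        >>> gains = exp_sigmoid(torch.linspace(-10.0, 10.0, 5))
+        >>> gains.shape
+        torch.Size([5])
+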
+ """
+
+ return max_value * torch.pow(
+ torch.nn.functional.sigmoid(input),
+ torch.log(torch.tensor(exponent, device=input.device, dtype=input.dtype)),
+ ) + torch.tensor(threshold, device=input.device, dtype=input.dtype)
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/_rir.py b/MLPY/Lib/site-packages/torchaudio/prototype/functional/_rir.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6c54974e4ee9396c8f76bd6fec9ceb1992e4426
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/functional/_rir.py
@@ -0,0 +1,379 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+import torchaudio
+from torch import Tensor
+
+
+def _compute_image_sources(
+ room: torch.Tensor,
+ source: torch.Tensor,
+ max_order: int,
+ absorption: torch.Tensor,
+ scatter: Optional[torch.Tensor] = None,
+) -> Tuple[Tensor, Tensor]:
+ """Compute image sources in a shoebox-like room.
+
+ Args:
+ room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
+ `(D,)`, where ``D`` is 2 if room is a 2D room, or 3 if room is a 3D room.
+ source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
+ `(D)`.
+ max_order (int): The maximum number of reflections of the source.
+ absorption (torch.Tensor): The absorption coefficients of wall materials.
+ ``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
+ The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
+            ``num_band`` is `1` if the coefficients are the same for all frequencies, or `7`
+            if the coefficients differ across frequencies. `7` refers to the default number
+ of octave bands. (See note in `simulate_rir_ism` method).
+ ``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
+ of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
+ Or it is `6` if the room is a 3D room, representing absorption coefficients
+ of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
+ scatter (torch.Tensor): The scattering coefficients of wall materials.
+ The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
+ used in image source computation. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
+ Tensor with dimensions `(num_image_source, D)`.
+ (torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
+ `(num_band, num_image_source)`.
+ """
+ if scatter is None:
+ tr = torch.sqrt(1 - absorption)
+ else:
+ tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
+
+ ind = torch.arange(-max_order, max_order + 1, device=source.device)
+ if room.shape[0] == 2:
+ XYZ = torch.meshgrid(ind, ind, indexing="ij")
+ else:
+ XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
+ XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
+ XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
+
+ # compute locations of image sources
+ d = room[None, :]
+ s = source[None, :]
+ img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
+
+ # attenuation
+ exp_lo = abs(torch.floor((XYZ / 2)))
+ exp_hi = abs(torch.floor((XYZ + 1) / 2))
+ t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1) # (num_band, left walls)
+ t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1) # (num_band, right walls)
+ att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1) # (num_band, num_image_source)
+ return img_loc, att
+
+
+def _hann(x: torch.Tensor, T: int):
+ """Compute the Hann window where the values are truncated based on window length.
+ Unlike ``torch.hann_window``, which can only sample the window function at integer points,
+ this helper samples the continuous window function at non-integer points.
+
+ Args:
+ x (torch.Tensor): The fractional component of time delay Tensor.
+ T (int): The window length of the sinc function.
+
+ Returns:
+ (torch.Tensor): The Hann window Tensor where values outside
+ the sinc window (`T`) are set to zero.
+ """
+ y = torch.where(
+ torch.abs(x) <= T / 2,
+ 0.5 * (1 + torch.cos(2 * math.pi * x / T)),
+ x.new_zeros(1),
+ )
+ return y
+
+
+def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
+ """Compute fractional delay of impulse response signal.
+
+ Args:
+ delay (torch.Tensor): The time delay Tensor in samples.
+ delay_i (torch.Tensor): The integer part of delay.
+ delay_filter_length (int): The window length for sinc function.
+
+ Returns:
+ (torch.Tensor): The impulse response Tensor for all image sources.
+ """
+ if delay_filter_length % 2 != 1:
+ raise ValueError("The filter length must be odd")
+
+ pad = delay_filter_length // 2
+ n = torch.arange(-pad, pad + 1, device=delay.device) + delay_i[..., None]
+ delay = delay[..., None]
+
+ return torch.special.sinc(n - delay) * _hann(n - delay, 2 * pad)
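A small sketch of the filters this helper produces; the delays below are toy values.

import torch

delay = torch.tensor([10.3, 25.7])   # fractional delays in samples (toy values)
delay_i = torch.ceil(delay)          # integer parts, as computed in simulate_rir_ism below
filters = _frac_delay(delay, delay_i, delay_filter_length=81)
print(filters.shape)   # torch.Size([2, 81]); one Hann-windowed sinc filter per delay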
+
+
+def _adjust_coeff(coeffs: Union[float, torch.Tensor], name: str) -> torch.Tensor:
+ """Validates and converts absorption or scattering parameters to a tensor with appropriate shape
+
+ Args:
+ coeff (float or torch.Tensor): The absorption coefficients of wall materials.
+
+ If the dtype is ``float``, the absorption coefficient is identical for all walls and
+ all frequencies.
+
+ If the input is a 1D Tensor, the shape must be `(6,)`,
+ where the values represent coefficients of ``"west"``, ``"east"``,
+ ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
+
+ If the input is a 2D Tensor, the shape must be `(7, 6)`,
+ where 7 represents the number of octave bands.
+ name (str): The coefficient name (``"absorption"`` or ``"scattering"``), used in error messages.
+
+ Returns:
+ (torch.Tensor): The expanded coefficient.
+ The shape is `(1, 6)` for single octave band case, and
+ `(7, 6)` for multi octave band case.
+ """
+ num_walls = 6
+ if isinstance(coeffs, float):
+ if coeffs < 0:
+ raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
+ return torch.full((1, num_walls), coeffs)
+ if isinstance(coeffs, Tensor):
+ if torch.any(coeffs < 0):
+ raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
+ if coeffs.ndim == 1:
+ if coeffs.numel() != num_walls:
+ raise ValueError(
+ f"The shape of `{name}` must be ({num_walls},) when it is a 1D Tensor. "
+ f"Found the shape {coeffs.shape}."
+ )
+ return coeffs.unsqueeze(0)
+ if coeffs.ndim == 2:
+ if coeffs.shape[1] != num_walls:
+ raise ValueError(
+ f"The shape of `{name}` must be (NUM_BANDS, {num_walls}) when it "
+ f"is a 2D Tensor. Found: {coeffs.shape}."
+ )
+ return coeffs
+ raise TypeError(f"`{name}` must be float or Tensor.")
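The three accepted formats and the shapes they are normalized to, with illustrative values:

import torch

print(_adjust_coeff(0.2, "absorption").shape)               # (1, 6): scalar applied to all six walls
print(_adjust_coeff(torch.rand(6), "absorption").shape)     # (1, 6): per-wall, single frequency band
print(_adjust_coeff(torch.rand(7, 6), "absorption").shape)  # (7, 6): per-wall, per octave band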
+
+
+def _validate_inputs(
+ room: torch.Tensor,
+ source: torch.Tensor,
+ mic_array: torch.Tensor,
+):
+ """Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.
+
+ Args:
+ room (torch.Tensor): The size of the room. width, length (and height)
+ source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(dim,)`.
+ mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, dim)`.
+ """
+ if not (room.ndim == 1 and room.numel() == 3):
+ raise ValueError(f"`room` must be a 1D Tensor with 3 elements. Found {room.shape}.")
+ if not (source.ndim == 1 and source.numel() == 3):
+ raise ValueError(f"`source` must be 1D Tensor with 3 elements. Found {source.shape}.")
+ if not (mic_array.ndim == 2 and mic_array.shape[1] == 3):
+ raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")
+
+
+def simulate_rir_ism(
+ room: torch.Tensor,
+ source: torch.Tensor,
+ mic_array: torch.Tensor,
+ max_order: int,
+ absorption: Union[float, torch.Tensor],
+ output_length: Optional[int] = None,
+ delay_filter_length: int = 81,
+ center_frequency: Optional[torch.Tensor] = None,
+ sound_speed: float = 343.0,
+ sample_rate: float = 16000.0,
+) -> Tensor:
+ r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
+ The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
+
+ .. devices:: CPU
+
+ .. properties:: TorchScript
+
+ Args:
+ room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
+ three dimensions of the room.
+ source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
+ mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
+ max_order (int): The maximum number of reflections of the source.
+ absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
+ coefficients of wall materials for sound energy.
+ If the dtype is ``float``, the absorption coefficient is identical for all walls and
+ all frequencies.
+ If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
+ absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
+ and ``"ceiling"``, respectively.
+ If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
+ output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
+ the length is defined as
+
+ .. math::
+ \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}
+
+ where ``max_d`` is the maximum distance between image sources and microphones.
+ delay_filter_length (int, optional): The filter length for computing sinc function. (Default: ``81``)
+ center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
+ Only used when ``absorption`` is a 2D Tensor.
+ sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
+ sample_rate (float, optional): The sample rate of the generated room impulse response signal.
+ (Default: ``16000.0``)
+
+ Returns:
+ (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
+ `(channel, rir_length)`.
+
+ Note:
+ If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
+ of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
+ Users need to tune the values of ``absorption`` to the corresponding frequencies.
+ """
+ _validate_inputs(room, source, mic_array)
+ absorption = _adjust_coeff(absorption, "absorption")
+ img_location, att = _compute_image_sources(room, source, max_order, absorption)
+
+ # compute distances between image sources and microphones
+ vec = img_location[:, None, :] - mic_array[None, :, :]
+ dist = torch.linalg.norm(vec, dim=-1) # (image_source, channel)
+
+ img_src_att = att[..., None] / dist[None, ...] # (band, image_source, channel)
+
+ # separate delays in integer / frac part
+ delay = dist * sample_rate / sound_speed # distance to delay in samples
+ delay_i = torch.ceil(delay) # integer part
+
+ # compute the short IRs corresponding to each image source
+ irs = img_src_att[..., None] * _frac_delay(delay, delay_i, delay_filter_length)[None, ...]
+
+ rir_length = int(delay_i.max() + irs.shape[-1])
+ rir = torch.ops.torchaudio._simulate_rir(irs, delay_i.type(torch.int32), rir_length)
+
+ # multi-band processing
+ if absorption.shape[0] > 1:
+ if center_frequency is None:
+ center = torch.tensor(
+ [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
+ )
+ else:
+ center = center_frequency
+ # n_fft is set to 512 by default.
+ filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
+ rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")
+
+ # sum up rir signals of all image sources into one waveform.
+ rir = rir.sum(0)
+
+ if output_length is not None:
+ if output_length > rir.shape[-1]:
+ rir = torch.nn.functional.pad(rir, (0, output_length - rir.shape[-1]), "constant", 0.0)
+ else:
+ rir = rir[..., :output_length]
+
+ return rir
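A hedged usage sketch: it assumes a torchaudio build that registers the ``torchaudio._simulate_rir`` extension op used above, and the room geometry is illustrative.

import torch

room = torch.tensor([8.0, 6.0, 3.0])
source = torch.tensor([2.0, 1.5, 1.2])
mic_array = torch.tensor([[4.0, 3.0, 1.5], [4.2, 3.0, 1.5]])  # two microphones
rir = simulate_rir_ism(room, source, mic_array, max_order=3, absorption=0.2)
print(rir.shape)  # (2, rir_length): one impulse response per microphone channel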
+
+
+def ray_tracing(
+ room: torch.Tensor,
+ source: torch.Tensor,
+ mic_array: torch.Tensor,
+ num_rays: int,
+ absorption: Union[float, torch.Tensor] = 0.0,
+ scattering: Union[float, torch.Tensor] = 0.0,
+ mic_radius: float = 0.5,
+ sound_speed: float = 343.0,
+ energy_thres: float = 1e-7,
+ time_thres: float = 10.0,
+ hist_bin_size: float = 0.004,
+) -> torch.Tensor:
+ r"""Compute energy histogram via ray tracing.
+
+ The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
+
+ ``num_rays`` rays are cast uniformly in all directions from the source;
+ when a ray intersects a wall, it is reflected and part of its energy is absorbed.
+ It is also scattered (sent directly to the microphone(s)) according to the ``scattering``
+ coefficient.
+ When a ray is close to the microphone, its current energy is recorded in the output
+ histogram for that given time slot.
+
+ .. devices:: CPU
+
+ .. properties:: TorchScript
+
+ Args:
+ room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
+ three dimensions of the room.
+ source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
+ mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
+ num_rays (int): The number of rays to cast from the source.
+ absorption (float or torch.Tensor, optional): The absorption coefficients of wall materials.
+ (Default: ``0.0``).
+ If the type is ``float``, the absorption coefficient is identical for all walls and
+ all frequencies.
+ If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, representing absorption
+ coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and
+ ``"ceiling"``, respectively.
+ If ``absorption`` is a 2D Tensor, the shape must be `(num_bands, 6)`.
+ ``num_bands`` is the number of frequency bands (usually 7).
+ scattering (float or torch.Tensor, optional): The scattering coefficients of wall materials. (Default: ``0.0``)
+ The shape and type of this parameter are the same as for ``absorption``.
+ mic_radius (float, optional): The radius of the microphone in meters. (Default: ``0.5``)
+ sound_speed (float, optional): The speed of sound in meters per second. (Default: ``343.0``)
+ energy_thres (float, optional): The energy level below which we stop tracing a ray. (Default: ``1e-7``)
+ The initial energy of each ray is ``2 / num_rays``.
+ time_thres (float, optional): The maximal duration for which rays are traced. (Unit: seconds) (Default: 10.0)
+ hist_bin_size (float, optional): The size of each bin in the output histogram. (Unit: seconds) (Default: 0.004)
+
+ Returns:
+ (torch.Tensor): The 3D histogram(s) where the energy of the traced ray is recorded.
+ Each bin corresponds to a given time slot.
+ The shape is `(channel, num_bands, num_bins)`, where
+ ``num_bins = ceil(time_thres / hist_bin_size)``.
+ If both ``absorption`` and ``scattering`` are floats, then ``num_bands == 1``.
+ """
+ if time_thres < hist_bin_size:
+ raise ValueError(
+ "`time_thres` must be greater than `hist_bin_size`. "
+ f"Found: hist_bin_size={hist_bin_size}, time_thres={time_thres}."
+ )
+
+ if room.dtype != source.dtype or source.dtype != mic_array.dtype:
+ raise ValueError(
+ "dtype of `room`, `source` and `mic_array` must match. "
+ f"Found: `room` ({room.dtype}), `source` ({source.dtype}) and "
+ f"`mic_array` ({mic_array.dtype})"
+ )
+
+ _validate_inputs(room, source, mic_array)
+ absorption = _adjust_coeff(absorption, "absorption").to(room.dtype)
+ scattering = _adjust_coeff(scattering, "scattering").to(room.dtype)
+
+ # Bring absorption and scattering to the same shape
+ if absorption.shape[0] == 1 and scattering.shape[0] > 1:
+ absorption = absorption.expand(scattering.shape)
+ if scattering.shape[0] == 1 and absorption.shape[0] > 1:
+ scattering = scattering.expand(absorption.shape)
+ if absorption.shape != scattering.shape:
+ raise ValueError(
+ "`absorption` and `scattering` must be broadcastable to the same number of bands and walls. "
+ f"Inferred shapes absorption={absorption.shape} and scattering={scattering.shape}"
+ )
+
+ histograms = torch.ops.torchaudio.ray_tracing(
+ room,
+ source,
+ mic_array,
+ num_rays,
+ absorption,
+ scattering,
+ mic_radius,
+ sound_speed,
+ energy_thres,
+ time_thres,
+ hist_bin_size,
+ )
+
+ return histograms
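A hedged usage sketch: the ray casting itself happens in the ``torchaudio.ray_tracing`` extension op, so a compatible torchaudio build is assumed and the geometry is illustrative.

import torch

room = torch.tensor([8.0, 6.0, 3.0])
source = torch.tensor([2.0, 1.5, 1.2])
mic_array = torch.tensor([[4.0, 3.0, 1.5]])
hist = ray_tracing(room, source, mic_array, num_rays=10_000, absorption=0.1, scattering=0.1)
# Scalar absorption/scattering give a single band; with the default thresholds
# num_bins = ceil(10.0 / 0.004) = 2500, so the shape is (1, 1, 2500).
print(hist.shape)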
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/functional/functional.py b/MLPY/Lib/site-packages/torchaudio/prototype/functional/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d14d7af29c5b72249a4b015e6bf6609a6acba78
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/functional/functional.py
@@ -0,0 +1,190 @@
+import math
+import warnings
+from typing import Optional
+
+import torch
+from torchaudio.functional.functional import _create_triangular_filterbank
+
+
+def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
+ r"""Convert Hz to Barks.
+
+ Args:
+ freqs (float): Frequencies in Hz
+ bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
+
+ Returns:
+ barks (float): Frequency in Barks
+ """
+
+ if bark_scale not in ["schroeder", "traunmuller", "wang"]:
+ raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
+
+ if bark_scale == "wang":
+ return 6.0 * math.asinh(freqs / 600.0)
+ elif bark_scale == "schroeder":
+ return 7.0 * math.asinh(freqs / 650.0)
+ # Traunmuller Bark scale
+ barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
+ # Bark value correction
+ if barks < 2:
+ barks += 0.15 * (2 - barks)
+ elif barks > 20.1:
+ barks += 0.22 * (barks - 20.1)
+
+ return barks
+
+
+def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
+ """Convert bark bin numbers to frequencies.
+
+ Args:
+ barks (torch.Tensor): Bark frequencies
+ bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
+
+ Returns:
+ freqs (torch.Tensor): Barks converted in Hz
+ """
+
+ if bark_scale not in ["schroeder", "traunmuller", "wang"]:
+ raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
+
+ if bark_scale == "wang":
+ return 600.0 * torch.sinh(barks / 6.0)
+ elif bark_scale == "schroeder":
+ return 650.0 * torch.sinh(barks / 7.0)
+ # Bark value correction
+ if any(barks < 2):
+ idx = barks < 2
+ barks[idx] = (barks[idx] - 0.3) / 0.85
+ elif any(barks > 20.1):
+ idx = barks > 20.1
+ barks[idx] = (barks[idx] + 4.422) / 1.22
+
+ # Traunmuller Bark scale
+ freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
+
+ return freqs
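A quick round-trip check of the conversion pair; the ``wang`` scale is used here because its forward and inverse formulas are exact inverses of each other.

import torch

b = _hz_to_bark(1000.0, bark_scale="wang")             # float in, float out: ~7.70 Bark
f = _bark_to_hz(torch.tensor([b]), bark_scale="wang")  # Tensor of Bark values in
print(b, f)  # ~7.70  tensor([1000.])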
+
+
+def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
+ a440 = 440.0 * 2.0 ** (tuning / bins_per_octave)
+ return torch.log2(freqs / (a440 / 16))
+
+
+def barkscale_fbanks(
+ n_freqs: int,
+ f_min: float,
+ f_max: float,
+ n_barks: int,
+ sample_rate: int,
+ bark_scale: str = "traunmuller",
+) -> torch.Tensor:
+ r"""Create a frequency bin conversion matrix.
+
+ .. devices:: CPU
+
+ .. properties:: TorchScript
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
+ :alt: Visualization of generated filter bank
+
+ Args:
+ n_freqs (int): Number of frequencies to highlight/apply
+ f_min (float): Minimum frequency (Hz)
+ f_max (float): Maximum frequency (Hz)
+ n_barks (int): Number of Bark filterbanks
+ sample_rate (int): Sample rate of the audio waveform
+ bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
+
+ Returns:
+ torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``),
+ i.e. the number of frequency bins by the number of filter banks.
+ Each column is a filter bank, so that given a matrix A of
+ size (..., ``n_freqs``), the applied result would be
+ ``A @ barkscale_fbanks(A.size(-1), ...)``.
+
+ """
+
+ # freq bins
+ all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
+
+ # calculate bark freq bins
+ m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
+ m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
+
+ m_pts = torch.linspace(m_min, m_max, n_barks + 2)
+ f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
+
+ # create filterbank
+ fb = _create_triangular_filterbank(all_freqs, f_pts)
+
+ if (fb.max(dim=0).values == 0.0).any():
+ warnings.warn(
+ "At least one bark filterbank has all zero values. "
+ f"The value for `n_barks` ({n_barks}) may be set too high. "
+ f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
+ )
+
+ return fb
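A sketch of applying the filter bank to a magnitude spectrogram via the matrix product described in the docstring above; the parameter values are illustrative.

import torch

n_fft = 1024
fb = barkscale_fbanks(n_freqs=n_fft // 2 + 1, f_min=0.0, f_max=8000.0, n_barks=32, sample_rate=16000)
spec = torch.rand(1, n_fft // 2 + 1, 100)             # (channel, n_freqs, time)
bark_spec = torch.matmul(spec.transpose(-1, -2), fb)  # (channel, time, n_barks)
print(fb.shape, bark_spec.shape)                      # torch.Size([513, 32]) torch.Size([1, 100, 32])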
+
+
+def chroma_filterbank(
+ sample_rate: int,
+ n_freqs: int,
+ n_chroma: int,
+ *,
+ tuning: float = 0.0,
+ ctroct: float = 5.0,
+ octwidth: Optional[float] = 2.0,
+ norm: int = 2,
+ base_c: bool = True,
+):
+ """Create a frequency-to-chroma conversion matrix. Implementation adapted from librosa.
+
+ Args:
+ sample_rate (int): Sample rate.
+ n_freqs (int): Number of input frequencies.
+ n_chroma (int): Number of output chroma.
+ tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
+ ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
+ octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
+ If ``None``, then disable weighting altogether. (Default: 2.0)
+ norm (int, optional): Order of the norm used to normalize the filter bank. (Default: 2)
+ base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
+
+ Returns:
+ torch.Tensor: Chroma filter bank, with shape `(n_freqs, n_chroma)`.
+ """
+ # Skip redundant upper half of frequency range.
+ freqs = torch.linspace(0, sample_rate // 2, n_freqs)[1:]
+ freq_bins = n_chroma * _hz_to_octs(freqs, bins_per_octave=n_chroma, tuning=tuning)
+ freq_bins = torch.cat((torch.tensor([freq_bins[0] - 1.5 * n_chroma]), freq_bins))
+ freq_bin_widths = torch.cat(
+ (
+ torch.maximum(freq_bins[1:] - freq_bins[:-1], torch.tensor(1.0)),
+ torch.tensor([1]),
+ )
+ )
+
+ # (n_freqs, n_chroma)
+ D = freq_bins.unsqueeze(1) - torch.arange(0, n_chroma)
+
+ n_chroma2 = round(n_chroma / 2)
+
+ # Project to range [-n_chroma/2, n_chroma/2 - 1]
+ D = torch.remainder(D + n_chroma2, n_chroma) - n_chroma2
+
+ fb = torch.exp(-0.5 * (2 * D / torch.tile(freq_bin_widths.unsqueeze(1), (1, n_chroma))) ** 2)
+ fb = torch.nn.functional.normalize(fb, p=norm, dim=1)
+
+ if octwidth is not None:
+ fb *= torch.tile(
+ torch.exp(-0.5 * (((freq_bins.unsqueeze(1) / n_chroma - ctroct) / octwidth) ** 2)),
+ (1, n_chroma),
+ )
+
+ if base_c:
+ fb = torch.roll(fb, -3 * (n_chroma // 12), dims=1)
+
+ return fb
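A sketch of pairing the chroma filter bank with a spectrogram to form a chromagram; shapes only, the values are illustrative.

import torch

n_fft = 2048
fb = chroma_filterbank(sample_rate=22050, n_freqs=n_fft // 2 + 1, n_chroma=12)
spec = torch.rand(1, n_fft // 2 + 1, 200)  # (channel, n_freqs, time)
chroma = torch.matmul(fb.T, spec)          # (channel, n_chroma, time)
print(fb.shape, chroma.shape)              # torch.Size([1025, 12]) torch.Size([1, 12, 200])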
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f2af31c07de6c025b795ba4b07dd7deeb3c3283
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/__init__.py
@@ -0,0 +1,36 @@
+from ._conformer_wav2vec2 import (
+ conformer_wav2vec2_base,
+ conformer_wav2vec2_model,
+ conformer_wav2vec2_pretrain_base,
+ conformer_wav2vec2_pretrain_large,
+ conformer_wav2vec2_pretrain_model,
+ ConformerWav2Vec2PretrainModel,
+)
+from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
+from .conv_emformer import ConvEmformer
+from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
+from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
+from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
+
+__all__ = [
+ "conformer_rnnt_base",
+ "conformer_rnnt_model",
+ "conformer_rnnt_biasing",
+ "conformer_rnnt_biasing_base",
+ "ConvEmformer",
+ "conformer_wav2vec2_model",
+ "conformer_wav2vec2_base",
+ "conformer_wav2vec2_pretrain_model",
+ "conformer_wav2vec2_pretrain_base",
+ "conformer_wav2vec2_pretrain_large",
+ "ConformerWav2Vec2PretrainModel",
+ "emformer_hubert_base",
+ "emformer_hubert_model",
+ "Hypothesis",
+ "RNNTBeamSearchBiasing",
+ "HiFiGANVocoder",
+ "hifigan_vocoder_v1",
+ "hifigan_vocoder_v2",
+ "hifigan_vocoder_v3",
+ "hifigan_vocoder",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cea7f4280252456808a6bc5e9b8aedbaae2f8bcd
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/_conformer_wav2vec2.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/_conformer_wav2vec2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b065dc45a5b33c96a3989beb7792d43b6aaf7c4
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/_conformer_wav2vec2.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/_emformer_hubert.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/_emformer_hubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..276156ea340b649771209c646c33040f5179e9df
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/_emformer_hubert.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/conv_emformer.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/conv_emformer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7bba3dd73aebcb430387a73b16170858911e832
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/conv_emformer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/hifi_gan.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/hifi_gan.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0814217ce187d54000ef800af72c68339890b75a
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/hifi_gan.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/rnnt.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/rnnt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d61ed5a71713a2d8bca15eebbd974abe16c13da
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/rnnt.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/rnnt_decoder.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/rnnt_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52dd176325ed8e20876a582cdc23eda860f8e70a
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/models/__pycache__/rnnt_decoder.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/_conformer_wav2vec2.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/_conformer_wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d079d553cd991c712aea78e7794099747723fda
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/_conformer_wav2vec2.py
@@ -0,0 +1,794 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn, Tensor
+from torch.nn import Module, ModuleList
+from torchaudio.models import Wav2Vec2Model
+from torchaudio.models.conformer import ConformerLayer
+from torchaudio.models.rnnt import _TimeReduction
+from torchaudio.models.wav2vec2 import components
+
+
+def _buffered_arange(max) -> Tensor:
+ """Compute arange using a buffered tensor across function calls.
+ Produces same result as torch.arange(end=max).
+
+ Args:
+ max (int): Ending value for arange.
+ """
+ if not hasattr(_buffered_arange, "buf"):
+ _buffered_arange.buf = torch.LongTensor()
+ if max > _buffered_arange.buf.numel():
+ _buffered_arange.buf.resize_(max)
+ torch.arange(max, out=_buffered_arange.buf)
+ return _buffered_arange.buf[:max]
+
+
+def _sample_negatives(input: Tensor, num_negatives: int, cross_sample_negatives: int) -> Tuple[Tensor, Tensor]:
+ """Sample negative examples from masked input.
+
+ Args:
+ input (Tensor): Tensor of dimension `(batch, frame, dim)`.
+ num_negatives (int): Number of negative examples to sample.
+ cross_sample_negatives (int): Number of negative examples to cross sample.
+
+ Returns:
+ (Tensor, Tensor):
+ Tensor
+ The negative samples.
+ Tensor
+ The indices of the negative samples.
+ """
+ if num_negatives == 0 and cross_sample_negatives == 0:
+ return (
+ torch.zeros(0).to(input.device, input.dtype),
+ torch.zeros(0).to(input.device, input.dtype),
+ )
+
+ B, T, D = input.shape
+ input = input.view(-1, D)
+
+ cross_high = T * B
+ high = T
+
+ assert high > 1
+
+ if num_negatives > 0:
+ tszs = _buffered_arange(T).unsqueeze(-1).expand(-1, num_negatives).flatten()
+
+ neg_idxs = torch.randint(low=0, high=high - 1, size=(B, num_negatives * T))
+ neg_idxs[neg_idxs >= tszs] += 1
+
+ if cross_sample_negatives > 0:
+ tszs = _buffered_arange(T).unsqueeze(-1).expand(-1, cross_sample_negatives).flatten()
+
+ cross_neg_idxs = torch.randint(low=0, high=cross_high - 1, size=(B, cross_sample_negatives * T))
+ cross_neg_idxs[cross_neg_idxs >= tszs] += 1
+
+ if num_negatives > 0:
+ neg_idxs = neg_idxs + (torch.arange(B).unsqueeze(1) * high)
+ else:
+ neg_idxs = cross_neg_idxs
+
+ if cross_sample_negatives > 0 and num_negatives > 0:
+ neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)
+
+ negs = input[neg_idxs.view(-1)]
+ negs = negs.view(B, T, num_negatives + cross_sample_negatives, D).permute(2, 0, 1, 3) # (N, B, T, D)
+
+ return negs, neg_idxs
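A shape sketch for the sampler above with toy dimensions:

import torch

B, T, D = 2, 50, 256
masked_features = torch.randn(B, T, D)
negs, neg_idxs = _sample_negatives(masked_features, num_negatives=10, cross_sample_negatives=0)
print(negs.shape)      # torch.Size([10, 2, 50, 256]): (num_negatives, batch, frame, dim)
print(neg_idxs.shape)  # torch.Size([2, 500]): ten sampled frame indices per target frame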
+
+
+class NegativeSampler(Module):
+ r"""Applies preprocessing to input and then computes negative sampling.
+
+ Args:
+ preprocessor (nn.Module): Transforms input tensor prior to negative sampling.
+ num_negatives (int): Number of negative examples to sample.
+ cross_sample_negatives (int): Number of negative examples to cross sample.
+ """
+
+ def __init__(
+ self,
+ preprocessor: Module,
+ num_negatives: int,
+ cross_sample_negatives: int,
+ ):
+ super().__init__()
+ self.preprocessor = preprocessor
+ self.num_negatives = num_negatives
+ self.cross_sample_negatives = cross_sample_negatives
+
+ def forward(self, input: Tensor) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+ """
+ Args:
+ input (Tensor): Tensor of dimension `(B, T, D)`.
+
+ Returns:
+ (Tensor, Tensor, Optional[Tensor]):
+ Tensor
+ The input tensor after preprocessing, prior to being sampled.
+ Tensor
+ The negative samples.
+ Tensor
+ The indices of the negative samples.
+ """
+ preprocessed = self.preprocessor(input)
+ negs, neg_idxs = _sample_negatives(preprocessed, self.num_negatives, self.cross_sample_negatives)
+ return preprocessed, negs, neg_idxs
+
+
+class FeatureEncoder(Module):
+ """Feature Encoder class, consisting of time reduction and linear layer.
+
+ Args:
+ stride (int): Number of frames to merge for the output frame.
+ input_dim (int): Input dimension of the tensor.
+ output_dim (int): Output dimension of the tensor.
+ """
+
+ def __init__(self, input_dim: int, output_dim: int, stride: int):
+ super().__init__()
+ self.time_reduction_layer = _TimeReduction(stride=stride)
+ self.linear_layer = nn.Linear(input_dim * stride, output_dim)
+
+ def forward(
+ self,
+ x: Tensor,
+ lengths: Optional[Tensor],
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ """
+ Args:
+ x (Tensor): Feature Tensor representing log Mel Spectrogram output. shape ``(B, T, D)``.
+ lengths (Tensor or None):
+ Valid length of each input sample. shape: ``(B, )``.
+
+ Returns:
+ (Tensor, Optional[Tensor]):
+ Tensor: output sequence after undergoing time reduction and linear projection.
+ Shape ``(B, T // stride, output_dim)``.
+ Optional[Tensor]: output lengths of shape ``(B,)`` if lengths parameter is provided,
+ otherwise `None`.
+ """
+ if lengths is None:
+ B, T, D = x.shape
+ dummy_lengths = torch.full((B,), T)
+ x, _ = self.time_reduction_layer(x, dummy_lengths)
+ x = self.linear_layer(x)
+ return x, None
+
+ x, lengths = self.time_reduction_layer(x, lengths)
+ x = self.linear_layer(x)
+ return x, lengths
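A shape sketch of the stride-4 time reduction followed by the linear projection, with toy dimensions:

import torch

encoder = FeatureEncoder(input_dim=64, output_dim=256, stride=4)
feats = torch.randn(2, 400, 64)   # (batch, frame, dim) log-mel features
lengths = torch.tensor([400, 380])
out, out_lengths = encoder(feats, lengths)
print(out.shape)    # torch.Size([2, 100, 256]): four frames merged per output frame, then projected
print(out_lengths)  # tensor([100,  95])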
+
+
+class ConformerEncoder(Module):
+ """Conformer Encoder class, consisting of feature projection and conformer modules.
+
+ Args:
+ feature_projection (nn.Module):
+ Projects feature to encoder dimension.
+ conformer (nn.ModuleList):
+ List of Conformer layers.
+ """
+
+ def __init__(
+ self,
+ feature_projection: Module,
+ conformer: ModuleList,
+ ):
+ super().__init__()
+ self.feature_projection = feature_projection
+ self.conformer = conformer
+
+ def _preprocess(
+ self,
+ features: Tensor,
+ lengths: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ x = self.feature_projection(features)
+ if lengths is not None:
+ mask = components._get_padding_mask(x, lengths)
+ else:
+ mask = None
+ return x, mask
+
+ def _get_intermediate_outputs(
+ self,
+ x: Tensor,
+ mask: Optional[Tensor] = None,
+ num_layers: Optional[int] = None,
+ ) -> List[Tensor]:
+ if num_layers is not None:
+ if not 0 < num_layers <= len(self.conformer):
+ raise ValueError(f"`num_layers` must be between [1, {len(self.conformer)}]")
+
+ ret: List[Tensor] = []
+
+ x = x.transpose(0, 1)
+ for layer in self.conformer:
+ x = layer(x, mask)
+ ret.append(x.transpose(0, 1))
+ if num_layers is not None and len(ret) >= num_layers:
+ return ret
+ return ret
+
+ def forward(
+ self,
+ features: Tensor,
+ lengths: Optional[Tensor] = None,
+ ) -> Tensor:
+ """
+ Args:
+ features (Tensor): Tensor of features of shape ``(B, T, D)``.
+ lengths (Tensor or None, optional): Valid length of each input sample. shape: ``(B, )``.
+
+ Returns:
+ Tensor: result after applying conformer encoder to features.
+ """
+ x, mask = self._preprocess(features, lengths)
+ x = x.transpose(0, 1)
+ for layer in self.conformer:
+ x = layer(x, mask)
+ return x.transpose(0, 1)
+
+ def extract_features(
+ self,
+ features: Tensor,
+ lengths: Optional[Tensor] = None,
+ num_layers: Optional[int] = None,
+ ) -> List[Tensor]:
+ """Returns the list of outputs from the intermediate layers of conformer block in the encoder.
+
+ Args:
+ features (Tensor): Tensor of features of shape ``(B, T, D)``.
+ lengths (Tensor or None, optional): Valid length of each input sample. shape: ``(B, )``.
+ num_layers (int or None, optional): If not ``None``, returns the outputs of the first
+ `num_layers` conformer layers; otherwise returns the outputs of all layers.
+
+ Returns:
+ List[Tensor]:
+ Features from requested layers. Each Tensor is of shape: `(batch, time frame, feature dimension)`.
+ """
+ x, masks = self._preprocess(features, lengths)
+ return self._get_intermediate_outputs(x, mask=masks, num_layers=num_layers)
+
+
+class ConformerWav2Vec2PretrainModel(Module):
+ """Conformer Wav2Vec2 pre-train model for training from scratch.
+
+ Note:
+ To build the model, please use one of the factory functions,
+ :py:func:`conformer_wav2vec2_pretrain_base` or :py:func:`conformer_wav2vec2_pretrain_large`
+
+ Args:
+ wav2vec2 (nn.Module):
+ Conformer based Wav2Vec2 model, including feature extractor and conformer encoder components.
+ mask_generator (nn.Module):
+ Mask generator that generates the mask for masked prediction during training.
+ negative_sampler (nn.Module):
+ Negative sampler to apply after masking.
+
+ """
+
+ def __init__(
+ self,
+ wav2vec2: Wav2Vec2Model,
+ mask_generator: Module,
+ negative_sampler: Module,
+ ):
+ super().__init__()
+ self.wav2vec2 = wav2vec2
+ self.mask_generator = mask_generator
+ self.negative_sampler = negative_sampler
+
+ def forward(
+ self,
+ features: Tensor,
+ audio_lengths: Optional[Tensor] = None,
+ ) -> Tuple[Tensor, Optional[Tensor], Tensor, Tensor, Tensor, Tensor]:
+ """
+ Args:
+ features (Tensor):
+ Tensor of audio features of shape `(batch, frame, dim)`.
+ audio_lengths (Tensor or None, optional):
+ Tensor of valid lengths of each audio sample in the batch.
+ shape: `(batch, )` (Default: ``None``)
+
+ Returns:
+ (Tensor, Optional[Tensor], Tensor, Tensor, Tensor, Tensor):
+ Tensor
+ The masked sequences of probability distribution of shape `(batch, frame, dim)`.
+ Tensor or None
+ If ``lengths`` argument was provided, a Tensor of shape `(batch, )` representing
+ valid length in time axis is returned.
+ Tensor
+ The mask indices.
+ Tensor
+ The targets, prior to negative sampling.
+ Tensor
+ The negative samples.
+ Tensor
+ The indices of the negative samples.
+ """
+ x, lengths = self.wav2vec2.feature_extractor(features, audio_lengths)
+
+ if lengths is not None:
+ padding_mask = components._get_padding_mask(x, lengths)
+ else:
+ padding_mask = None
+
+ x = self.wav2vec2.encoder.feature_projection.layer_norm(x)
+ x = self.wav2vec2.encoder.feature_projection.dropout(x)
+
+ # Unmasked feature is used to generate positive and negative samples.
+ unmasked_x = x.clone()
+ # Apply masking to x before passing it to Conformer layers.
+ x, mask_idxs = self.mask_generator(x, padding_mask)
+ # Select the frames from masked indices for negative sampling.
+ unmasked_x = unmasked_x[mask_idxs].view(x.shape[0], -1, x.shape[-1])
+ targets, negs, neg_idxs = self.negative_sampler(unmasked_x)
+
+ x = self.wav2vec2.encoder.feature_projection.projection(x)
+ x = x.transpose(0, 1)
+ for conformer_layer in self.wav2vec2.encoder.conformer:
+ x = conformer_layer(x, padding_mask)
+ x = x.transpose(0, 1)
+
+ return x, lengths, mask_idxs, targets, negs, neg_idxs
+
+
+################################################################################
+def _get_conformer_feature_extractor(
+ input_dim: int,
+ output_dim: int,
+ stride: int,
+) -> FeatureEncoder:
+ """Construct Feature Extractor
+
+ Args:
+ input_dim (int): Input dimension of features.
+ output_dim (int): Output dimension after feature extraction.
+ stride (int): Stride used in Time Reduction layer of feature extractor.
+
+ Returns:
+ FeatureEncoder: The resulting feature extraction.
+ """
+ return FeatureEncoder(input_dim, output_dim, stride)
+
+
+def _get_conformer_encoder(
+ in_features: int,
+ embed_dim: int,
+ dropout_input: float,
+ num_layers: int,
+ num_heads: int,
+ ff_interm_features: int,
+ dropout: float,
+ depthwise_conv_kernel_size: Union[int, List[int]],
+ convolution_first: bool,
+ use_group_norm: bool,
+) -> ConformerEncoder:
+ """Construct Conformer Encoder
+
+ Args:
+ in_features (int): The number of input features.
+ embed_dim (int): The dimension of the embedding in the feature projection.
+ dropout_input (float): The dropout probability applied after the input feature
+ is projected to ``embed_dim``.
+ num_layers (int): Number of Conformer layers in the encoder.
+ num_heads (int): Number of heads in each Conformer layer.
+ ff_interm_features (int): Hidden layer dimension of the feedforward network in
+ each Conformer layer.
+ dropout (float): Dropout probability in each Conformer layer.
+ depthwise_conv_kernel_size (int or List[int]): List of kernel sizes corresponding
+ to each of the Conformer layers. If int is provided, all layers will have the
+ same kernel size.
+ convolution_first (bool): Whether to apply the convolution module ahead of the
+ attention module in each Conformer layer.
+ use_group_norm (bool): Whether to use ``GroupNorm`` rather than ``BatchNorm1d`` in
+ the convolution module in each Conformer layer.
+
+ Returns:
+ ConformerEncoder:
+ The resulting conformer encoder module.
+ """
+ feature_projection = components.FeatureProjection(in_features, embed_dim, dropout_input)
+
+ if type(depthwise_conv_kernel_size) == int:
+ depthwise_conv_kernel_size = [depthwise_conv_kernel_size] * num_layers
+
+ assert len(depthwise_conv_kernel_size) == num_layers
+
+ conformer_layers = []
+ for l in range(num_layers):
+ layer = ConformerLayer(
+ input_dim=embed_dim,
+ ffn_dim=ff_interm_features,
+ num_attention_heads=num_heads,
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size[l],
+ dropout=dropout,
+ use_group_norm=use_group_norm,
+ convolution_first=convolution_first,
+ )
+ conformer_layers.append(layer)
+
+ return ConformerEncoder(feature_projection, ModuleList(conformer_layers))
+
+
+def _get_conformer_negativer_sampler(
+ input_dim: int,
+ output_dim: int,
+ num_negatives: int,
+ cross_sample_negatives: int,
+) -> NegativeSampler:
+ """Build custom NegativeSampler module, including linear layer and negative sampling.
+
+ Args:
+ input_dim (int): Dimension of input after feature extraction.
+ output_dim (int): Dimension of embedding for use in negative sampling. Same as the
+ embedding in the feature projection.
+ num_negatives (int): Number of negatives to sample.
+ cross_sample_negatives (int): Number of cross sampled negatives.
+
+ Returns:
+ NegativeSampler:
+ The resulting negative sampler module.
+ """
+ preprocessor = nn.Linear(input_dim, output_dim)
+ return NegativeSampler(preprocessor, num_negatives, cross_sample_negatives)
+
+
+def conformer_wav2vec2_model(
+ extractor_input_dim: int,
+ extractor_output_dim: int,
+ extractor_stride: int,
+ encoder_embed_dim: int,
+ encoder_projection_dropout: float,
+ encoder_num_layers: int,
+ encoder_num_heads: int,
+ encoder_ff_interm_features: int,
+ encoder_depthwise_conv_kernel_size: Union[int, List[int]],
+ encoder_dropout: float,
+ encoder_convolution_first: bool,
+ encoder_use_group_norm: bool,
+) -> Wav2Vec2Model:
+ """Build a custom Conformer Wav2Vec2Model
+
+ Args:
+ extractor_input_dim (int): Input dimension of the features.
+ extractor_output_dim (int): Output dimension after feature extraction.
+ extractor_stride (int): Stride used in time reduction layer of feature extraction.
+ encoder_embed_dim (int): The dimension of the embedding in the feature projection.
+ encoder_projection_dropout (float):
+ The dropout probability applied after the input feature is projected to ``embed_dim``
+ encoder_num_layers (int): Number of Conformer layers in the encoder.
+ encoder_num_heads (int): Number of heads in each Conformer layer.
+ encoder_ff_interm_features (int):
+ Hidden layer dimension of the feedforward network in each Conformer layer.
+ encoder_depthwise_conv_kernel_size (int or List[int]):
+ List of kernel sizes corresponding to each of the Conformer layers.
+ If int is provided, all layers will have the same kernel size.
+ encoder_dropout (float): Dropout probability in each Conformer layer.
+ encoder_convolution_first (bool):
+ Whether to apply the convolution module ahead of the attention module
+ in each Conformer layer.
+ encoder_use_group_norm (bool):
+ Whether to use ``GroupNorm`` rather than ``BatchNorm1d`` in the convolution
+ module in each Conformer layer.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting wav2vec2 model with a conformer encoder.
+ """
+ feature_extractor = _get_conformer_feature_extractor(
+ extractor_input_dim,
+ extractor_output_dim,
+ extractor_stride,
+ )
+
+ encoder = _get_conformer_encoder(
+ in_features=extractor_output_dim,
+ embed_dim=encoder_embed_dim,
+ dropout_input=encoder_projection_dropout,
+ num_layers=encoder_num_layers,
+ num_heads=encoder_num_heads,
+ ff_interm_features=encoder_ff_interm_features,
+ depthwise_conv_kernel_size=encoder_depthwise_conv_kernel_size,
+ dropout=encoder_dropout,
+ convolution_first=encoder_convolution_first,
+ use_group_norm=encoder_use_group_norm,
+ )
+
+ return Wav2Vec2Model(feature_extractor, encoder)
+
+
+def conformer_wav2vec2_base(
+ extractor_input_dim: int = 64,
+ extractor_output_dim: int = 256,
+ encoder_projection_dropout: float = 0.0,
+) -> Wav2Vec2Model:
+ """
+ Build Conformer Wav2Vec2 Model with "small" architecture from
+ *Conformer-Based Self-Supervised Learning for Non-Speech Audio Tasks* :cite:`9746490`
+
+ Args:
+ extractor_input_dim (int, optional): Input dimension of feature extractor. (Default: 64)
+ extractor_output_dim (int, optional): Output dimension of feature extractor. (Default: 256)
+ encoder_projection_dropout (float, optional):
+ Dropout probability applied after feature projection. (Default: 0.0)
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting wav2vec2 model with a conformer encoder and ``base`` configuration.
+ """
+ return conformer_wav2vec2_model(
+ extractor_input_dim=extractor_input_dim,
+ extractor_output_dim=extractor_output_dim,
+ extractor_stride=4,
+ encoder_embed_dim=256,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_num_layers=12,
+ encoder_num_heads=8,
+ encoder_ff_interm_features=1024,
+ encoder_depthwise_conv_kernel_size=[31] + [15] * 11,
+ encoder_dropout=0.1,
+ encoder_convolution_first=True,
+ encoder_use_group_norm=True,
+ )
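A hedged usage sketch of the ``base`` configuration: unlike waveform-based wav2vec 2.0 models, this one consumes pre-computed features (e.g. 64-dimensional log-mel frames); the batch below is random toy data.

import torch

model = conformer_wav2vec2_base()
features = torch.randn(2, 400, 64)  # (batch, frame, extractor_input_dim)
lengths = torch.tensor([400, 360])
out, out_lengths = model(features, lengths)
print(out.shape)    # torch.Size([2, 100, 256]): stride-4 time reduction, 256-dim encoder output
print(out_lengths)  # tensor([100,  90])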
+
+
+def conformer_wav2vec2_pretrain_model(
+ extractor_input_dim: int,
+ extractor_output_dim: int,
+ extractor_stride: int,
+ encoder_embed_dim: int,
+ encoder_projection_dropout: float,
+ encoder_num_layers: int,
+ encoder_num_heads: int,
+ encoder_ff_interm_features: int,
+ encoder_depthwise_conv_kernel_size: int,
+ encoder_dropout: float,
+ encoder_convolution_first: bool,
+ encoder_use_group_norm: bool,
+ mask_prob: float,
+ mask_selection: str,
+ mask_other: float,
+ mask_length: int,
+ no_mask_overlap: bool,
+ mask_min_space: int,
+ mask_channel_prob: float,
+ mask_channel_selection: str,
+ mask_channel_other: float,
+ mask_channel_length: int,
+ no_mask_channel_overlap: bool,
+ mask_channel_min_space: int,
+ num_negatives: int,
+ cross_sample_negatives: int,
+) -> ConformerWav2Vec2PretrainModel:
+ """Build a custom Conformer Wav2Vec2 Model for pre-training
+
+ Args:
+ extractor_input_dim (int): Input dimension of the features.
+ extractor_output_dim (int): Output dimension after feature extraction.
+ extractor_stride (int):
+ Stride used in time reduction layer of feature extraction.
+ encoder_embed_dim (int):
+ The dimension of the embedding in the feature projection.
+ encoder_projection_dropout (float):
+ The dropout probability applied after the input feature is projected to
+ ``embed_dim``
+ encoder_num_layers (int):
+ Number of Conformer layers in the encoder.
+ encoder_num_heads (int):
+ Number of heads in each Conformer layer.
+ encoder_ff_interm_features (int):
+ Hidden layer dimension of the feedforward network in each Conformer layer.
+ encoder_depthwise_conv_kernel_size (int or List[int]):
+ List of kernel sizes corresponding to each of the Conformer layers.
+ If int is provided, all layers will have the same kernel size.
+ encoder_dropout (float):
+ Dropout probability in each Conformer layer.
+ encoder_convolution_first (bool):
+ Whether to apply the convolution module ahead of the attention module
+ in each Conformer layer.
+ encoder_use_group_norm (bool):
+ Whether to use ``GroupNorm`` rather than ``BatchNorm1d`` in the convolution
+ module in each Conformer layer.
+ mask_prob (float):
+ Probability for each token to be chosen as start of the span to be masked.
+ mask_selection (str):
+ How to choose the mask length. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+ mask_other (float):
+ Secondary mask argument (used for more complex distributions).
+ mask_length (int):
+ The lengths of the mask.
+ no_mask_overlap (bool):
+ Whether to allow masks to overlap.
+ mask_min_space (int):
+ Minimum space between spans (if no overlap is enabled).
+ mask_channel_prob (float):
+ The probability of replacing a feature with 0.
+ mask_channel_selection (str):
+ How to choose the mask length for channel masking.
+ Options: [``static``, ``uniform``, ``normal``, ``poisson``].
+ mask_channel_other (float):
+ Secondary mask argument for channel masking (used for more complex distributions).
+ mask_channel_length (int):
+ The lengths of the mask for channel masking.
+ no_mask_channel_overlap (bool):
+ Whether to allow channel masks to overlap.
+ mask_channel_min_space (int):
+ Minimum space between spans for channel masking (if no overlap is enabled).
+ num_negatives (int):
+ Number of negatives to sample.
+ cross_sample_negatives (int):
+ Number of cross sampled negatives.
+
+ Returns:
+ ConformerWav2Vec2PretrainModel:
+ The resulting model.
+ """
+ wav2vec2 = conformer_wav2vec2_model(
+ extractor_input_dim,
+ extractor_output_dim,
+ extractor_stride,
+ encoder_embed_dim,
+ encoder_projection_dropout,
+ encoder_num_layers,
+ encoder_num_heads,
+ encoder_ff_interm_features,
+ encoder_depthwise_conv_kernel_size,
+ encoder_dropout,
+ encoder_convolution_first,
+ encoder_use_group_norm,
+ )
+
+ mask_generator = components.MaskGenerator(
+ extractor_output_dim,
+ mask_prob,
+ mask_selection,
+ mask_other,
+ mask_length,
+ no_mask_overlap,
+ mask_min_space,
+ mask_channel_prob,
+ mask_channel_selection,
+ mask_channel_other,
+ mask_channel_length,
+ no_mask_channel_overlap,
+ mask_channel_min_space,
+ )
+
+ negative_sampler = _get_conformer_negativer_sampler(
+ extractor_output_dim,
+ encoder_embed_dim,
+ num_negatives,
+ cross_sample_negatives,
+ )
+
+ return ConformerWav2Vec2PretrainModel(
+ wav2vec2=wav2vec2,
+ mask_generator=mask_generator,
+ negative_sampler=negative_sampler,
+ )
+
+
+def conformer_wav2vec2_pretrain_base(
+ extractor_input_dim: int = 64,
+ extractor_output_dim: int = 256,
+ encoder_projection_dropout: float = 0.0,
+ mask_prob: float = 0.3,
+ mask_length: int = 3,
+ num_negatives: int = 100,
+ cross_sample_negatives: int = 0,
+) -> ConformerWav2Vec2PretrainModel:
+ """Build Conformer Wav2Vec2 Model for pre-training with "small" architecture from
+ *Conformer-Based Self-Supervised Learning for Non-Speech Audio Tasks* :cite:`9746490`
+
+ Args:
+ extractor_input_dim (int, optional): Input dimension of the features. (Default: 64)
+ extractor_output_dim (int, optional): Output dimension after feature extraction. (Default: 256)
+ encoder_projection_dropout (float, optional):
+ The dropout probability applied after the input feature is projected to
+ ``embed_dim``. (Default: 0.0)
+ mask_prob (float, optional):
+ Probability for each token to be chosen as start of the span to be masked. (Default: 0.3)
+ mask_length (int, optional):
+ The lengths of the mask. (Default: 3)
+ num_negatives (int, optional):
+ Number of sampled negatives. (Default: 100)
+ cross_sample_negatives (int, optional):
+ Number of cross sampled negatives. (Default: 0)
+
+ Returns:
+ ConformerWav2Vec2PretrainModel:
+ The resulting model.
+ """
+ return conformer_wav2vec2_pretrain_model(
+ extractor_input_dim=extractor_input_dim,
+ extractor_output_dim=extractor_output_dim,
+ extractor_stride=4,
+ encoder_embed_dim=256,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_num_layers=12,
+ encoder_num_heads=8,
+ encoder_ff_interm_features=1024,
+ encoder_depthwise_conv_kernel_size=[31] + [15] * 11,
+ encoder_dropout=0.1,
+ encoder_convolution_first=True,
+ encoder_use_group_norm=True,
+ mask_prob=mask_prob,
+ mask_selection="static",
+ mask_other=0.0,
+ mask_length=mask_length,
+ no_mask_overlap=False,
+ mask_min_space=0,
+ mask_channel_prob=0,
+ mask_channel_selection="static",
+ mask_channel_other=0,
+ mask_channel_length=10,
+ no_mask_channel_overlap=False,
+ mask_channel_min_space=1,
+ num_negatives=num_negatives,
+ cross_sample_negatives=cross_sample_negatives,
+ )
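A hedged sketch of the pre-training forward pass; the six outputs mirror the tuple documented in ``ConformerWav2Vec2PretrainModel.forward`` above, and the inputs are random toy data.

import torch

model = conformer_wav2vec2_pretrain_base()
features = torch.randn(2, 400, 64)  # (batch, frame, dim) log-mel features
lengths = torch.tensor([400, 360])
x, out_lengths, mask_idxs, targets, negs, neg_idxs = model(features, lengths)
print(x.shape)          # torch.Size([2, 100, 256]): encoder output over the reduced frames
print(mask_idxs.shape)  # torch.Size([2, 100]): boolean mask marking the masked frames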
+
+
+def conformer_wav2vec2_pretrain_large(
+ extractor_input_dim: int = 64,
+ extractor_output_dim: int = 256,
+ encoder_projection_dropout: float = 0.0,
+ mask_prob: float = 0.3,
+ mask_length: int = 3,
+ num_negatives: int = 100,
+ cross_sample_negatives: int = 0,
+) -> ConformerWav2Vec2PretrainModel:
+ """Build Conformer Wav2Vec2 Model for pre-training with "large" architecture from
+ *Conformer-Based Self-Supervised Learning for Non-Speech Audio Tasks* :cite:`9746490`
+
+ Args:
+ extractor_input_dim (int, optional): Input dimension of the features. (Default: 64)
+ extractor_output_dim (int, optional): Output dimension after feature extraction. (Default: 256)
+ encoder_projection_dropout (float, optional):
+ The dropout probability applied after the input feature is projected to
+ ``embed_dim``. (Default: 0.0)
+ mask_prob (float, optional):
+ Probability for each token to be chosen as start of the span to be masked. (Default: 0.3)
+ mask_length (int, optional):
+ The lengths of the mask. (Default: 3)
+ num_negatives (int, optional):
+ Number of sampled negatives. (Default: 100)
+ cross_sample_negatives (int, optional):
+ Number of cross sampled negatives. (Default: 0)
+
+ Returns:
+ ConformerWav2Vec2PretrainModel:
+ The resulting model.
+ """
+ return conformer_wav2vec2_pretrain_model(
+ extractor_input_dim=extractor_input_dim,
+ extractor_output_dim=extractor_output_dim,
+ extractor_stride=4,
+ encoder_embed_dim=768,
+ encoder_projection_dropout=encoder_projection_dropout,
+ encoder_num_layers=12,
+ encoder_num_heads=12,
+ encoder_ff_interm_features=1024,
+ encoder_depthwise_conv_kernel_size=[31] + [15] * 11,
+ encoder_dropout=0.1,
+ encoder_convolution_first=True,
+ encoder_use_group_norm=True,
+ mask_prob=mask_prob,
+ mask_selection="static",
+ mask_other=0.0,
+ mask_length=mask_length,
+ no_mask_overlap=False,
+ mask_min_space=0,
+ mask_channel_prob=0,
+ mask_channel_selection="static",
+ mask_channel_other=0,
+ mask_channel_length=10,
+ no_mask_channel_overlap=False,
+ mask_channel_min_space=1,
+ num_negatives=num_negatives,
+ cross_sample_negatives=cross_sample_negatives,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/_emformer_hubert.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/_emformer_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdf13761bcbe40f16cf04207a14abd606edd64ef
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/_emformer_hubert.py
@@ -0,0 +1,333 @@
+from typing import List, Optional, Tuple
+
+import torch
+from torchaudio.models import Wav2Vec2Model
+from torchaudio.models.emformer import Emformer
+from torchaudio.models.rnnt import _TimeReduction
+
+
+class FeatureEncoder(torch.nn.Module):
+ """Extract features from log-mel spectrogram input. Consists of linear layer and time reduction layer.
+
+ Args:
+ input_dim (int): The feature dimension of log-mel spectrogram feature.
+ output_dim (int): The feature dimension after linear layer.
+ use_bias (bool): If ``True``, enable bias parameter in the linear layer.
+ stride (int): Number of frames to merge for the output frame.
+ """
+
+ def __init__(self, input_dim: int, output_dim: int, use_bias: bool, stride: int):
+ super().__init__()
+ self.linear = torch.nn.Linear(input_dim, output_dim, bias=use_bias)
+ self.time_reduction = _TimeReduction(stride)
+
+ def forward(
+ self, input: torch.Tensor, lengths: Optional[torch.Tensor]
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Args:
+ input (torch.Tensor): The log-mel spectrogram input.
+ Tensor with dimensions `(batch, time, input_dim)`.
+ lengths (torch.Tensor or None): Valid length of each input sample.
+ Tensor with dimension `(batch, )`.
+
+ Returns:
+ (torch.Tensor, torch.Tensor or None):
+ torch.Tensor
+ Returned feature Tensor after linear layer and time reduction layer.
+ Tensor with dimensions `(batch, time // stride, output_dim)`.
+ torch.Tensor or None
+ The reduced lengths Tensor.
+ """
+ output = self.linear(input)
+ if lengths is None:
+ B, T, _ = input.shape
+ dummy_lengths = torch.full((B,), T)
+ output, _ = self.time_reduction(output, dummy_lengths)
+ else:
+ output, lengths = self.time_reduction(output, lengths)
+ return output, lengths
+
+
+class EmformerEncoder(torch.nn.Module):
+ """Emformer Encoder class for HuBERT pre-training. Consists of emformer module,
+ linear layer and layer normalization layer.
+
+ Args:
+ emformer (torch.nn.Module):
+ :py:class:`torchaudio.models.Emformer` module that consists of a list of emformer layers.
+ output_linear (torch.nn.Module):
+ Linear layer after emformer module.
+ layer_norm (torch.nn.Module):
+ Apply layer normalization to the output.
+ """
+
+ def __init__(
+ self,
+ emformer: torch.nn.Module,
+ output_linear: torch.nn.Module,
+ layer_norm: torch.nn.Module,
+ ):
+ super().__init__()
+ self.emformer = emformer
+ self.output_linear = output_linear
+ self.layer_norm = layer_norm
+
+ def forward(
+ self,
+ input: torch.Tensor,
+ lengths: Optional[torch.Tensor],
+ ) -> torch.Tensor:
+ """
+ Args:
+ input (torch.Tensor): The input feature for emformer encoder.
+ Tensor with dimensions `(batch, time, feature_dim)`.
+ lengths (torch.Tensor or None): Valid length of each input sample.
+ Tensor with dimension `(batch, )`.
+
+ Returns:
+ torch.Tensor: The feature Tensor after emformer encoder.
+ """
+ if lengths is None:
+ B, T, _ = input.shape
+ dummy_lengths = torch.full((B,), T)
+ output, _ = self.emformer(input, dummy_lengths)
+ else:
+ output, lengths = self.emformer(input, lengths)
+ output = self.output_linear(output)
+ output = self.layer_norm(output)
+ return output
+
+ def extract_features(
+ self,
+ input: torch.Tensor,
+ lengths: Optional[torch.Tensor],
+ num_layers: Optional[int] = None,
+ ) -> List[torch.Tensor]:
+ """Extract output Tensors of the emformer layers.
+
+ Args:
+ input (torch.Tensor): The input feature for emformer encoder.
+ Tensor with dimensions `(batch, time, feature_dim)`.
+ lengths (torch.Tensor or None): Valid length of each input sample.
+ Tensor with dimension `(batch, )`.
+ num_layers (int or None, optional): If not ``None``, returns the first
+ `num_layers` layers of Tensors as the output, otherwise returns the
+ Tensors from all emformer layers.
+
+ Returns:
+ List[torch.Tensor]:
+ Output Tensors of selected emformer layers.
+ """
+ if num_layers is not None:
+ if not 0 < num_layers <= len(self.emformer.emformer_layers):
+ raise ValueError(f"`num_layers` must be between [1, {len(self.emformer.emformer_layers)}]")
+
+ ret: List[torch.Tensor] = []
+
+ input = input.permute(1, 0, 2)
+ right_context = self.emformer._gen_right_context(input)
+ utterance = input[: input.size(0) - self.emformer.right_context_length]
+ attention_mask = self.emformer._gen_attention_mask(utterance)
+ mems = (
+ self.emformer.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
+ if self.emformer.use_mem
+ else torch.empty(0).to(dtype=input.dtype, device=input.device)
+ )
+ output = utterance
+ if lengths is None:
+ # `input` was permuted to `(T, B, D)` above, so the batch size is the second dimension.
+ T, B, _ = input.shape
+ lengths = torch.full((B,), T)
+ for layer in self.emformer.emformer_layers:
+ output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
+ ret.append(output.permute(1, 0, 2))
+ if num_layers is not None and len(ret) >= num_layers:
+ return ret
+ return ret
+
+
+def _get_emformer_feature_extractor(input_dim: int, output_dim: int, use_bias: bool, stride: int) -> FeatureEncoder:
+ """Construct FeatureEncoder for emformer model.
+
+ Args:
+ input_dim (int): The feature dimension of log-mel spectrogram feature.
+ output_dim (int): The feature dimension after linear layer.
+ use_bias (bool): If ``True``, enable bias parameter in the linear layer.
+ stride (int): Number of frames to merge for the output frame.
+
+ Returns:
+ FeatureEncoder: The resulting FeatureEncoder module.
+ """
+ return FeatureEncoder(input_dim, output_dim, use_bias, stride)
+
+
+def _get_emformer_encoder(
+ input_dim: int,
+ output_dim: int,
+ num_heads: int,
+ ffn_dim: int,
+ num_layers: int,
+ segment_length: int,
+ left_context_length: int,
+ right_context_length: int,
+ dropout: float,
+ activation: str,
+ max_memory_size: int,
+ weight_init_scale_strategy: Optional[str],
+ tanh_on_mem: bool,
+) -> EmformerEncoder:
+ """Construct EmformerEncoder for emformer model.
+
+ Args:
+ input_dim (int): The feature dimension of input Tensor.
+ output_dim (int): The feature dimension after EmformerEncoder.
+ num_heads (int): Number of attention heads in each Emformer layer.
+ ffn_dim (int): Hidden layer dimension of feedforward network.
+ num_layers (int): Number of Emformer layers to instantiate.
+ segment_length (int): Length of each input segment.
+ left_context_length (int): Length of left context.
+ right_context_length (int): Length of right context.
+ dropout (float): Dropout probability.
+ activation (str): Activation function to use in each Emformer layer's
+ feedforward network. Must be one of ("relu", "gelu", "silu").
+ max_memory_size (int): Maximum number of memory elements to use.
+ weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
+ strategy. Must be one of ("depthwise", "constant", ``None``).
+ tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
+
+ Returns:
+ EmformerEncoder: The resulting EmformerEncoder module.
+ """
+ emformer = Emformer(
+ input_dim=input_dim,
+ num_heads=num_heads,
+ ffn_dim=ffn_dim,
+ num_layers=num_layers,
+ segment_length=segment_length,
+ left_context_length=left_context_length,
+ right_context_length=right_context_length,
+ dropout=dropout,
+ activation=activation,
+ max_memory_size=max_memory_size,
+ weight_init_scale_strategy=weight_init_scale_strategy,
+ tanh_on_mem=tanh_on_mem,
+ )
+ output_linear = torch.nn.Linear(input_dim, output_dim)
+ layer_norm = torch.nn.LayerNorm(output_dim)
+ return EmformerEncoder(emformer, output_linear, layer_norm)
+
+
+def emformer_hubert_model(
+ extractor_input_dim: int,
+ extractor_output_dim: int,
+ extractor_use_bias: bool,
+ extractor_stride: int,
+ encoder_input_dim: int,
+ encoder_output_dim: int,
+ encoder_num_heads: int,
+ encoder_ffn_dim: int,
+ encoder_num_layers: int,
+ encoder_segment_length: int,
+ encoder_left_context_length: int,
+ encoder_right_context_length: int,
+ encoder_dropout: float,
+ encoder_activation: str,
+ encoder_max_memory_size: int,
+ encoder_weight_init_scale_strategy: Optional[str],
+ encoder_tanh_on_mem: bool,
+ aux_num_out: Optional[int],
+) -> Wav2Vec2Model:
+ """Build a custom Emformer HuBERT model.
+
+ Args:
+ extractor_input_dim (int): The input dimension for feature extractor.
+ extractor_output_dim (int): The output dimension after feature extractor.
+ extractor_use_bias (bool): If ``True``, enable bias parameter in the linear layer of feature extractor.
+ extractor_stride (int): Number of frames to merge for the output frame in feature extractor.
+ encoder_input_dim (int): The input dimension for Emformer layer.
+ encoder_output_dim (int): The output dimension after EmformerEncoder.
+ encoder_num_heads (int): Number of attention heads in each Emformer layer.
+ encoder_ffn_dim (int): Hidden layer dimension of feedforward network in Emformer.
+ encoder_num_layers (int): Number of Emformer layers to instantiate.
+ encoder_segment_length (int): Length of each input segment.
+ encoder_left_context_length (int): Length of left context.
+ encoder_right_context_length (int): Length of right context.
+ encoder_dropout (float): Dropout probability.
+ encoder_activation (str): Activation function to use in each Emformer layer's
+ feedforward network. Must be one of ("relu", "gelu", "silu").
+ encoder_max_memory_size (int): Maximum number of memory elements to use.
+ encoder_weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
+ strategy. Must be one of ("depthwise", "constant", ``None``).
+ encoder_tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
+ aux_num_out (int or None):
+ When provided, attach an extra linear layer on top of encoder, which can be
+ used for fine-tuning.
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
+ with a :py:class:`torchaudio.models.Emformer` encoder.
+ """
+ feature_extractor = _get_emformer_feature_extractor(
+ extractor_input_dim, extractor_output_dim, extractor_use_bias, extractor_stride
+ )
+ emformer = _get_emformer_encoder(
+ encoder_input_dim,
+ encoder_output_dim,
+ encoder_num_heads,
+ encoder_ffn_dim,
+ encoder_num_layers,
+ encoder_segment_length,
+ encoder_left_context_length,
+ encoder_right_context_length,
+ encoder_dropout,
+ encoder_activation,
+ encoder_max_memory_size,
+ encoder_weight_init_scale_strategy,
+ encoder_tanh_on_mem,
+ )
+ aux = None
+ if aux_num_out is not None:
+ aux = torch.nn.Linear(in_features=encoder_output_dim, out_features=aux_num_out)
+ return Wav2Vec2Model(feature_extractor, emformer, aux)
+
+
+def emformer_hubert_base(
+ extractor_input_dim: int = 80,
+ extractor_output_dim: int = 128,
+ encoder_dropout: float = 0.1,
+ aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+ """Build Emformer HuBERT Model with 20 Emformer layers.
+
+ Args:
+ extractor_input_dim (int, optional): The input dimension for feature extractor. (Default: 80)
+ extractor_output_dim (int, optional): The output dimension after feature extractor. (Default: 128)
+ encoder_dropout (float, optional): Dropout probability in Emformer. (Default: 0.1)
+ aux_num_out (int or None, optional): Output dimension of aux layer for fine-tuning. (Default: ``None``)
+
+ Returns:
+ Wav2Vec2Model:
+ The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
+ with a :py:class:`torchaudio.models.Emformer` encoder.
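+
+ Example:
+ A minimal sketch of the assumed usage; the log-mel feature tensor is random
+ data used only to illustrate the expected `(batch, num_frames, feature)` shapes.
+
+ >>> model = emformer_hubert_base()
+ >>> features = torch.rand(2, 400, 80)  # (batch, num_frames, extractor_input_dim)
+ >>> lengths = torch.tensor([400, 360])
+ >>> outputs, output_lengths = model(features, lengths)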
+ """
+ return emformer_hubert_model(
+ extractor_input_dim=extractor_input_dim,
+ extractor_output_dim=extractor_output_dim,
+ extractor_use_bias=False,
+ extractor_stride=4,
+ encoder_input_dim=512,
+ encoder_output_dim=1024,
+ encoder_num_heads=8,
+ encoder_ffn_dim=2048,
+ encoder_num_layers=20,
+ encoder_segment_length=4,
+ encoder_left_context_length=30,
+ encoder_right_context_length=1,
+ encoder_dropout=encoder_dropout,
+ encoder_activation="gelu",
+ encoder_max_memory_size=0,
+ encoder_weight_init_scale_strategy="depthwise",
+ encoder_tanh_on_mem=True,
+ aux_num_out=aux_num_out,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/conv_emformer.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/conv_emformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..75a1e474c909d984e03edd44409a8a161747a637
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/conv_emformer.py
@@ -0,0 +1,525 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+from torchaudio.models.emformer import _EmformerAttention, _EmformerImpl, _get_weight_init_gains
+
+
+def _get_activation_module(activation: str) -> torch.nn.Module:
+ if activation == "relu":
+ return torch.nn.ReLU()
+ elif activation == "gelu":
+ return torch.nn.GELU()
+ elif activation == "silu":
+ return torch.nn.SiLU()
+ else:
+ raise ValueError(f"Unsupported activation {activation}")
+
+
+class _ResidualContainer(torch.nn.Module):
+ def __init__(self, module: torch.nn.Module, output_weight: float):
+ super().__init__()
+ self.module = module
+ self.output_weight = output_weight
+
+ def forward(self, input: torch.Tensor):
+ output = self.module(input)
+ return output * self.output_weight + input
+
+
+class _ConvolutionModule(torch.nn.Module):
+ def __init__(
+ self,
+ input_dim: int,
+ segment_length: int,
+ right_context_length: int,
+ kernel_size: int,
+ activation: str = "silu",
+ dropout: float = 0.0,
+ ):
+ super().__init__()
+ self.input_dim = input_dim
+ self.segment_length = segment_length
+ self.right_context_length = right_context_length
+ self.state_size = kernel_size - 1
+
+ self.pre_conv = torch.nn.Sequential(
+ torch.nn.LayerNorm(input_dim), torch.nn.Linear(input_dim, 2 * input_dim, bias=True), torch.nn.GLU()
+ )
+ self.conv = torch.nn.Conv1d(
+ in_channels=input_dim,
+ out_channels=input_dim,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=0,
+ groups=input_dim,
+ )
+ self.post_conv = torch.nn.Sequential(
+ torch.nn.LayerNorm(input_dim),
+ _get_activation_module(activation),
+ torch.nn.Linear(input_dim, input_dim, bias=True),
+ torch.nn.Dropout(p=dropout),
+ )
+
+ def _split_right_context(self, utterance: torch.Tensor, right_context: torch.Tensor) -> torch.Tensor:
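+ # For each right-context block, prepend the last (kernel_size - 1) frames of the
+ # utterance segment it follows, so the depthwise convolution sees the same left
+ # context it would see when run over the full sequence.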
+ T, B, D = right_context.size()
+ if T % self.right_context_length != 0:
+ raise ValueError("Tensor length should be divisible by its right context length")
+ num_segments = T // self.right_context_length
+ # (num_segments, right context length, B, D)
+ right_context_segments = right_context.reshape(num_segments, self.right_context_length, B, D)
+ right_context_segments = right_context_segments.permute(0, 2, 1, 3).reshape(
+ num_segments * B, self.right_context_length, D
+ )
+
+ pad_segments = [] # [(kernel_size - 1, B, D), ...]
+ for seg_idx in range(num_segments):
+ end_idx = min(self.state_size + (seg_idx + 1) * self.segment_length, utterance.size(0))
+ start_idx = end_idx - self.state_size
+ pad_segments.append(utterance[start_idx:end_idx, :, :])
+
+ pad_segments = torch.cat(pad_segments, dim=1).permute(1, 0, 2) # (num_segments * B, kernel_size - 1, D)
+ return torch.cat([pad_segments, right_context_segments], dim=1).permute(0, 2, 1)
+
+ def _merge_right_context(self, right_context: torch.Tensor, B: int) -> torch.Tensor:
+ # (num_segments * B, D, right_context_length)
+ right_context = right_context.reshape(-1, B, self.input_dim, self.right_context_length)
+ right_context = right_context.permute(0, 3, 1, 2)
+ return right_context.reshape(-1, B, self.input_dim) # (right_context_length * num_segments, B, D)
+
+ def forward(
+ self, utterance: torch.Tensor, right_context: torch.Tensor, state: Optional[torch.Tensor]
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ input = torch.cat((right_context, utterance)) # input: (T, B, D)
+ x = self.pre_conv(input)
+ x_right_context, x_utterance = x[: right_context.size(0), :, :], x[right_context.size(0) :, :, :]
+ x_utterance = x_utterance.permute(1, 2, 0) # (B, D, T_utterance)
+
+ if state is None:
+ state = torch.zeros(
+ input.size(1),
+ input.size(2),
+ self.state_size,
+ device=input.device,
+ dtype=input.dtype,
+ ) # (B, D, T)
+ state_x_utterance = torch.cat([state, x_utterance], dim=2)
+
+ conv_utterance = self.conv(state_x_utterance) # (B, D, T_utterance)
+ conv_utterance = conv_utterance.permute(2, 0, 1)
+
+ if self.right_context_length > 0:
+ # (B * num_segments, D, right_context_length + kernel_size - 1)
+ right_context_block = self._split_right_context(state_x_utterance.permute(2, 0, 1), x_right_context)
+ conv_right_context_block = self.conv(right_context_block) # (B * num_segments, D, right_context_length)
+ # (T_right_context, B, D)
+ conv_right_context = self._merge_right_context(conv_right_context_block, input.size(1))
+ y = torch.cat([conv_right_context, conv_utterance], dim=0)
+ else:
+ y = conv_utterance
+
+ output = self.post_conv(y) + input
+ new_state = state_x_utterance[:, :, -self.state_size :]
+ return output[right_context.size(0) :], output[: right_context.size(0)], new_state
+
+ def infer(
+ self, utterance: torch.Tensor, right_context: torch.Tensor, state: Optional[torch.Tensor]
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ input = torch.cat((utterance, right_context))
+ x = self.pre_conv(input) # (T, B, D)
+ x = x.permute(1, 2, 0) # (B, D, T)
+
+ if state is None:
+ state = torch.zeros(
+ input.size(1),
+ input.size(2),
+ self.state_size,
+ device=input.device,
+ dtype=input.dtype,
+ ) # (B, D, T)
+ state_x = torch.cat([state, x], dim=2)
+ conv_out = self.conv(state_x)
+ conv_out = conv_out.permute(2, 0, 1) # T, B, D
+ output = self.post_conv(conv_out) + input
+ new_state = state_x[:, :, -self.state_size - right_context.size(0) : -right_context.size(0)]
+ return output[: utterance.size(0)], output[utterance.size(0) :], new_state
+
+
+class _ConvEmformerLayer(torch.nn.Module):
+ r"""Convolution-augmented Emformer layer that constitutes ConvEmformer.
+
+ Args:
+ input_dim (int): input dimension.
+ num_heads (int): number of attention heads.
+ ffn_dim: (int): hidden layer dimension of feedforward network.
+ segment_length (int): length of each input segment.
+ kernel_size (int): size of kernel to use in convolution module.
+ dropout (float, optional): dropout probability. (Default: 0.0)
+ ffn_activation (str, optional): activation function to use in feedforward network.
+ Must be one of ("relu", "gelu", "silu"). (Default: "relu")
+ left_context_length (int, optional): length of left context. (Default: 0)
+ right_context_length (int, optional): length of right context. (Default: 0)
+ max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
+ weight_init_gain (float or None, optional): scale factor to apply when initializing
+ attention module parameters. (Default: ``None``)
+ tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
+ negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
+ conv_activation (str, optional): activation function to use in convolution module.
+ Must be one of ("relu", "gelu", "silu"). (Default: "silu")
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ num_heads: int,
+ ffn_dim: int,
+ segment_length: int,
+ kernel_size: int,
+ dropout: float = 0.0,
+ ffn_activation: str = "relu",
+ left_context_length: int = 0,
+ right_context_length: int = 0,
+ max_memory_size: int = 0,
+ weight_init_gain: Optional[float] = None,
+ tanh_on_mem: bool = False,
+ negative_inf: float = -1e8,
+ conv_activation: str = "silu",
+ ):
+ super().__init__()
+ # TODO: implement talking heads attention.
+ self.attention = _EmformerAttention(
+ input_dim=input_dim,
+ num_heads=num_heads,
+ dropout=dropout,
+ weight_init_gain=weight_init_gain,
+ tanh_on_mem=tanh_on_mem,
+ negative_inf=negative_inf,
+ )
+ self.dropout = torch.nn.Dropout(dropout)
+ self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True)
+
+ activation_module = _get_activation_module(ffn_activation)
+ self.ffn0 = _ResidualContainer(
+ torch.nn.Sequential(
+ torch.nn.LayerNorm(input_dim),
+ torch.nn.Linear(input_dim, ffn_dim),
+ activation_module,
+ torch.nn.Dropout(dropout),
+ torch.nn.Linear(ffn_dim, input_dim),
+ torch.nn.Dropout(dropout),
+ ),
+ 0.5,
+ )
+ self.ffn1 = _ResidualContainer(
+ torch.nn.Sequential(
+ torch.nn.LayerNorm(input_dim),
+ torch.nn.Linear(input_dim, ffn_dim),
+ activation_module,
+ torch.nn.Dropout(dropout),
+ torch.nn.Linear(ffn_dim, input_dim),
+ torch.nn.Dropout(dropout),
+ ),
+ 0.5,
+ )
+ self.layer_norm_input = torch.nn.LayerNorm(input_dim)
+ self.layer_norm_output = torch.nn.LayerNorm(input_dim)
+
+ self.conv = _ConvolutionModule(
+ input_dim=input_dim,
+ kernel_size=kernel_size,
+ activation=conv_activation,
+ dropout=dropout,
+ segment_length=segment_length,
+ right_context_length=right_context_length,
+ )
+
+ self.left_context_length = left_context_length
+ self.segment_length = segment_length
+ self.max_memory_size = max_memory_size
+ self.input_dim = input_dim
+ self.kernel_size = kernel_size
+ self.use_mem = max_memory_size > 0
+
+ def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]:
+ empty_memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device)
+ left_context_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
+ left_context_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
+ past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device)
+ conv_cache = torch.zeros(
+ batch_size,
+ self.input_dim,
+ self.kernel_size - 1,
+ device=device,
+ )
+ return [empty_memory, left_context_key, left_context_val, past_length, conv_cache]
+
+ def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
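+ # Trim the fixed-size memory / left-context caches down to the portion that has
+ # been filled so far, as tracked by the running ``past_length`` counter.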
+ past_length = state[3][0][0].item()
+ past_left_context_length = min(self.left_context_length, past_length)
+ past_mem_length = min(self.max_memory_size, math.ceil(past_length / self.segment_length))
+ pre_mems = state[0][self.max_memory_size - past_mem_length :]
+ lc_key = state[1][self.left_context_length - past_left_context_length :]
+ lc_val = state[2][self.left_context_length - past_left_context_length :]
+ conv_cache = state[4]
+ return pre_mems, lc_key, lc_val, conv_cache
+
+ def _pack_state(
+ self,
+ next_k: torch.Tensor,
+ next_v: torch.Tensor,
+ update_length: int,
+ mems: torch.Tensor,
+ conv_cache: torch.Tensor,
+ state: List[torch.Tensor],
+ ) -> List[torch.Tensor]:
+ new_k = torch.cat([state[1], next_k])
+ new_v = torch.cat([state[2], next_v])
+ state[0] = torch.cat([state[0], mems])[-self.max_memory_size :]
+ state[1] = new_k[new_k.shape[0] - self.left_context_length :]
+ state[2] = new_v[new_v.shape[0] - self.left_context_length :]
+ state[3] = state[3] + update_length
+ state[4] = conv_cache
+ return state
+
+ def _apply_pre_attention(
+ self, utterance: torch.Tensor, right_context: torch.Tensor, summary: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ x = torch.cat([right_context, utterance, summary])
+ ffn0_out = self.ffn0(x)
+ layer_norm_input_out = self.layer_norm_input(ffn0_out)
+ layer_norm_input_right_context, layer_norm_input_utterance, layer_norm_input_summary = (
+ layer_norm_input_out[: right_context.size(0)],
+ layer_norm_input_out[right_context.size(0) : right_context.size(0) + utterance.size(0)],
+ layer_norm_input_out[right_context.size(0) + utterance.size(0) :],
+ )
+ return ffn0_out, layer_norm_input_right_context, layer_norm_input_utterance, layer_norm_input_summary
+
+ def _apply_post_attention(
+ self,
+ rc_output: torch.Tensor,
+ ffn0_out: torch.Tensor,
+ conv_cache: Optional[torch.Tensor],
+ rc_length: int,
+ utterance_length: int,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ result = self.dropout(rc_output) + ffn0_out[: rc_length + utterance_length]
+ conv_utterance, conv_right_context, conv_cache = self.conv(result[rc_length:], result[:rc_length], conv_cache)
+ result = torch.cat([conv_right_context, conv_utterance])
+ result = self.ffn1(result)
+ result = self.layer_norm_output(result)
+ output_utterance, output_right_context = result[rc_length:], result[:rc_length]
+ return output_utterance, output_right_context, conv_cache
+
+ def forward(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ mems: torch.Tensor,
+ attention_mask: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ D: feature dimension of each frame;
+ T: number of utterance frames;
+ R: number of right context frames;
+ M: number of memory elements.
+
+ Args:
+ utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``utterance``.
+ right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
+ mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
+ attention_mask (torch.Tensor): attention mask for underlying attention module.
+
+ Returns:
+ (Tensor, Tensor, Tensor):
+ Tensor
+ encoded utterance frames, with shape `(T, B, D)`.
+ Tensor
+ updated right context frames, with shape `(R, B, D)`.
+ Tensor
+ updated memory elements, with shape `(M, B, D)`.
+ """
+ if self.use_mem:
+ summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
+ else:
+ summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
+
+ (
+ ffn0_out,
+ layer_norm_input_right_context,
+ layer_norm_input_utterance,
+ layer_norm_input_summary,
+ ) = self._apply_pre_attention(utterance, right_context, summary)
+
+ rc_output, output_mems = self.attention(
+ utterance=layer_norm_input_utterance,
+ lengths=lengths,
+ right_context=layer_norm_input_right_context,
+ summary=layer_norm_input_summary,
+ mems=mems,
+ attention_mask=attention_mask,
+ )
+
+ output_utterance, output_right_context, _ = self._apply_post_attention(
+ rc_output, ffn0_out, None, right_context.size(0), utterance.size(0)
+ )
+
+ return output_utterance, output_right_context, output_mems
+
+ @torch.jit.export
+ def infer(
+ self,
+ utterance: torch.Tensor,
+ lengths: torch.Tensor,
+ right_context: torch.Tensor,
+ state: Optional[List[torch.Tensor]],
+ mems: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
+ r"""Forward pass for inference.
+
+ B: batch size;
+ D: feature dimension of each frame;
+ T: number of utterance frames;
+ R: number of right context frames;
+ M: number of memory elements.
+
+ Args:
+ utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``utterance``.
+ right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
+ state (List[torch.Tensor] or None): list of tensors representing layer internal state
+ generated in preceding invocation of ``infer``.
+ mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
+
+ Returns:
+ (Tensor, Tensor, List[torch.Tensor], Tensor):
+ Tensor
+ encoded utterance frames, with shape `(T, B, D)`.
+ Tensor
+ updated right context frames, with shape `(R, B, D)`.
+ List[Tensor]
+ list of tensors representing layer internal state
+ generated in current invocation of ``infer``.
+ Tensor
+ updated memory elements, with shape `(M, B, D)`.
+ """
+ if self.use_mem:
+ summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:1]
+ else:
+ summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
+
+ (
+ ffn0_out,
+ layer_norm_input_right_context,
+ layer_norm_input_utterance,
+ layer_norm_input_summary,
+ ) = self._apply_pre_attention(utterance, right_context, summary)
+
+ if state is None:
+ state = self._init_state(layer_norm_input_utterance.size(1), device=layer_norm_input_utterance.device)
+ pre_mems, lc_key, lc_val, conv_cache = self._unpack_state(state)
+
+ rc_output, next_m, next_k, next_v = self.attention.infer(
+ utterance=layer_norm_input_utterance,
+ lengths=lengths,
+ right_context=layer_norm_input_right_context,
+ summary=layer_norm_input_summary,
+ mems=pre_mems,
+ left_context_key=lc_key,
+ left_context_val=lc_val,
+ )
+
+ output_utterance, output_right_context, conv_cache = self._apply_post_attention(
+ rc_output, ffn0_out, conv_cache, right_context.size(0), utterance.size(0)
+ )
+ output_state = self._pack_state(next_k, next_v, utterance.size(0), mems, conv_cache, state)
+ return output_utterance, output_right_context, output_state, next_m
+
+
+class ConvEmformer(_EmformerImpl):
+ r"""Implements the convolution-augmented streaming transformer architecture introduced in
+ *Streaming Transformer Transducer based Speech Recognition Using Non-Causal Convolution*
+ :cite:`9747706`.
+
+ Args:
+ input_dim (int): input dimension.
+ num_heads (int): number of attention heads in each ConvEmformer layer.
+ ffn_dim (int): hidden layer dimension of each ConvEmformer layer's feedforward network.
+ num_layers (int): number of ConvEmformer layers to instantiate.
+ segment_length (int): length of each input segment.
+ kernel_size (int): size of kernel to use in convolution modules.
+ dropout (float, optional): dropout probability. (Default: 0.0)
+ ffn_activation (str, optional): activation function to use in feedforward networks.
+ Must be one of ("relu", "gelu", "silu"). (Default: "relu")
+ left_context_length (int, optional): length of left context. (Default: 0)
+ right_context_length (int, optional): length of right context. (Default: 0)
+ max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
+ weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
+ strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
+ tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
+ negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
+ conv_activation (str, optional): activation function to use in convolution modules.
+ Must be one of ("relu", "gelu", "silu"). (Default: "silu")
+
+ Examples:
+ >>> conv_emformer = ConvEmformer(80, 4, 1024, 12, 16, 8, right_context_length=4)
+ >>> input = torch.rand(10, 200, 80)
+ >>> lengths = torch.randint(1, 200, (10,))
+ >>> output, lengths = conv_emformer(input, lengths)
+ >>> input = torch.rand(4, 20, 80)
+ >>> lengths = torch.ones(4) * 20
+ >>> output, lengths, states = conv_emformer.infer(input, lengths, None)
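+
+ The following continues the example above and sketches the assumed streaming
+ pattern: ``infer`` is fed chunks of ``segment_length + right_context_length``
+ (= 16 + 4) frames, and the returned ``states`` are passed back on the next call.
+
+ >>> states = None
+ >>> for chunk in torch.rand(5, 4, 20, 80):
+ ... output, lengths, states = conv_emformer.infer(chunk, torch.full((4,), 20), states)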
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ num_heads: int,
+ ffn_dim: int,
+ num_layers: int,
+ segment_length: int,
+ kernel_size: int,
+ dropout: float = 0.0,
+ ffn_activation: str = "relu",
+ left_context_length: int = 0,
+ right_context_length: int = 0,
+ max_memory_size: int = 0,
+ weight_init_scale_strategy: Optional[str] = "depthwise",
+ tanh_on_mem: bool = False,
+ negative_inf: float = -1e8,
+ conv_activation: str = "silu",
+ ):
+ weight_init_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers)
+ emformer_layers = torch.nn.ModuleList(
+ [
+ _ConvEmformerLayer(
+ input_dim,
+ num_heads,
+ ffn_dim,
+ segment_length,
+ kernel_size,
+ dropout=dropout,
+ ffn_activation=ffn_activation,
+ left_context_length=left_context_length,
+ right_context_length=right_context_length,
+ max_memory_size=max_memory_size,
+ weight_init_gain=weight_init_gains[layer_idx],
+ tanh_on_mem=tanh_on_mem,
+ negative_inf=negative_inf,
+ conv_activation=conv_activation,
+ )
+ for layer_idx in range(num_layers)
+ ]
+ )
+ super().__init__(
+ emformer_layers,
+ segment_length,
+ left_context_length=left_context_length,
+ right_context_length=right_context_length,
+ max_memory_size=max_memory_size,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/hifi_gan.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/hifi_gan.py
new file mode 100644
index 0000000000000000000000000000000000000000..1db30eaec0345deba321b17bbeea331a793963e3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/hifi_gan.py
@@ -0,0 +1,336 @@
+"""
+MIT License
+
+Copyright (c) 2020 Jungil Kong
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d, ConvTranspose1d
+
+
+class HiFiGANVocoder(torch.nn.Module):
+ """Generator part of *HiFi GAN* :cite:`NEURIPS2020_c5d73680`.
+ Source: https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/models.py#L75
+
+ Note:
+ To build the model, please use one of the factory functions: :py:func:`hifigan_vocoder`,
+ :py:func:`hifigan_vocoder_v1`, :py:func:`hifigan_vocoder_v2`, :py:func:`hifigan_vocoder_v3`.
+
+ Args:
+ in_channels (int): Number of channels in the input features.
+ upsample_rates (tuple of ``int``): Factors by which each upsampling layer increases the time dimension.
+ upsample_initial_channel (int): Number of channels produced by the initial convolution;
+ each subsequent upsampling layer halves this channel count.
+ upsample_kernel_sizes (tuple of ``int``): Kernel size for each upsampling layer.
+ resblock_kernel_sizes (tuple of ``int``): Kernel size for each residual block.
+ resblock_dilation_sizes (tuple of tuples of ``int``): Dilation sizes for each 1D convolutional layer in each
+ residual block. For resblock type 1 inner tuples should have length 3, because there are 3
+ convolutions in each layer. For resblock type 2 they should have length 2.
+ resblock_type (int, 1 or 2): Determines whether ``ResBlock1`` or ``ResBlock2`` will be used.
+ lrelu_slope (float): Slope of leaky ReLUs in activations.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ upsample_rates: Tuple[int, ...],
+ upsample_initial_channel: int,
+ upsample_kernel_sizes: Tuple[int, ...],
+ resblock_kernel_sizes: Tuple[int, ...],
+ resblock_dilation_sizes: Tuple[Tuple[int, ...], ...],
+ resblock_type: int,
+ lrelu_slope: float,
+ ):
+ super(HiFiGANVocoder, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)
+ resblock = ResBlock1 if resblock_type == 1 else ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(
+ ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for (k, d) in zip(resblock_kernel_sizes, resblock_dilation_sizes):
+ self.resblocks.append(resblock(ch, k, d, lrelu_slope))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3)
+ self.lrelu_slope = lrelu_slope
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ x (Tensor): Feature input tensor of shape `(batch_size, num_channels, time_length)`.
+
+ Returns:
+ Tensor of shape `(batch_size, 1, time_length * upsample_rate)`, where `upsample_rate` is the product
+ of upsample rates for all layers.
+ """
+ x = self.conv_pre(x)
+ for i, upsampling_layer in enumerate(self.ups):
+ x = F.leaky_relu(x, self.lrelu_slope)
+ x = upsampling_layer(x)
+ xs = torch.zeros_like(x)
+ for j in range(self.num_kernels):
+ res_block: ResBlockInterface = self.resblocks[i * self.num_kernels + j]
+ xs += res_block.forward(x)
+ x = xs / self.num_kernels
+
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+
+@torch.jit.interface
+class ResBlockInterface(torch.nn.Module):
+ """Interface for ResBlock - necessary to make type annotations in ``HiFiGANVocoder.forward`` compatible
+ with TorchScript
+ """
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ pass
+
+
+class ResBlock1(torch.nn.Module):
+ """Residual block of type 1 for HiFiGAN Vocoder :cite:`NEURIPS2020_c5d73680`.
+ Args:
+ channels (int): Number of channels in the input features.
+ kernel_size (int, optional): Kernel size for 1D convolutions. (Default: ``3``)
+ dilation (tuple of 3 ``int``, optional): Dilations for each 1D convolution. (Default: ``(1, 3, 5)``)
+ lrelu_slope (float): Slope of leaky ReLUs in activations.
+ """
+
+ def __init__(
+ self, channels: int, kernel_size: int = 3, dilation: Tuple[int, int, int] = (1, 3, 5), lrelu_slope: float = 0.1
+ ):
+ super(ResBlock1, self).__init__()
+ self.convs1 = nn.ModuleList(
+ [
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]),
+ ),
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]),
+ ),
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2]),
+ ),
+ ]
+ )
+
+ self.convs2 = nn.ModuleList(
+ [
+ Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
+ Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
+ Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
+ ]
+ )
+ self.lrelu_slope = lrelu_slope
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ x (Tensor): input of shape ``(batch_size, channels, time_length)``.
+ Returns:
+ Tensor of the same shape as input.
+ """
+ for conv1, conv2 in zip(self.convs1, self.convs2):
+ xt = F.leaky_relu(x, self.lrelu_slope)
+ xt = conv1(xt)
+ xt = F.leaky_relu(xt, self.lrelu_slope)
+ xt = conv2(xt)
+ x = xt + x
+ return x
+
+
+class ResBlock2(torch.nn.Module):
+ """Residual block of type 2 for HiFiGAN Vocoder :cite:`NEURIPS2020_c5d73680`.
+ Args:
+ channels (int): Number of channels in the input features.
+ kernel_size (int, optional): Kernel size for 1D convolutions. (Default: ``3``)
+ dilation (tuple of 2 ``int``, optional): Dilations for each 1D convolution. (Default: ``(1, 3)``)
+ lrelu_slope (float): Slope of leaky ReLUs in activations.
+ """
+
+ def __init__(
+ self, channels: int, kernel_size: int = 3, dilation: Tuple[int, int] = (1, 3), lrelu_slope: float = 0.1
+ ):
+ super(ResBlock2, self).__init__()
+ self.convs = nn.ModuleList(
+ [
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]),
+ ),
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]),
+ ),
+ ]
+ )
+ self.lrelu_slope = lrelu_slope
+
+ def forward(self, x: torch.Tensor):
+ """
+ Args:
+ x (Tensor): input of shape ``(batch_size, channels, time_length)``.
+ Returns:
+ Tensor of the same shape as input.
+ """
+ for c in self.convs:
+ xt = F.leaky_relu(x, self.lrelu_slope)
+ xt = c(xt)
+ x = xt + x
+ return x
+
+
+def get_padding(kernel_size, dilation=1):
+ """Find padding for which 1D convolution preserves the input shape."""
+ return int((kernel_size * dilation - dilation) / 2)
+
+
+def hifigan_vocoder(
+ in_channels: int,
+ upsample_rates: Tuple[int, ...],
+ upsample_initial_channel: int,
+ upsample_kernel_sizes: Tuple[int, ...],
+ resblock_kernel_sizes: Tuple[int, ...],
+ resblock_dilation_sizes: Tuple[Tuple[int, ...], ...],
+ resblock_type: int,
+ lrelu_slope: float,
+) -> HiFiGANVocoder:
+ r"""Builds HiFi GAN Vocoder :cite:`NEURIPS2020_c5d73680`.
+
+ Args:
+ in_channels (int): See :py:class:`HiFiGANVocoder`.
+ upsample_rates (tuple of ``int``): See :py:class:`HiFiGANVocoder`.
+ upsample_initial_channel (int): See :py:class:`HiFiGANVocoder`.
+ upsample_kernel_sizes (tuple of ``int``): See :py:class:`HiFiGANVocoder`.
+ resblock_kernel_sizes (tuple of ``int``): See :py:class:`HiFiGANVocoder`.
+ resblock_dilation_sizes (tuple of tuples of ``int``): See :py:class:`HiFiGANVocoder`.
+ resblock_type (int, 1 or 2): See :py:class:`HiFiGANVocoder`.
+ lrelu_slope (float): See :py:class:`HiFiGANVocoder`.
+ Returns:
+ HiFiGANVocoder: generated model.
+ """
+
+ return HiFiGANVocoder(
+ upsample_rates=upsample_rates,
+ resblock_kernel_sizes=resblock_kernel_sizes,
+ resblock_dilation_sizes=resblock_dilation_sizes,
+ resblock_type=resblock_type,
+ upsample_initial_channel=upsample_initial_channel,
+ upsample_kernel_sizes=upsample_kernel_sizes,
+ in_channels=in_channels,
+ lrelu_slope=lrelu_slope,
+ )
+
+
+def hifigan_vocoder_v1() -> HiFiGANVocoder:
+ r"""Builds HiFiGAN Vocoder with V1 architecture :cite:`NEURIPS2020_c5d73680`.
+
+ Returns:
+ HiFiGANVocoder: generated model.
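+
+ Example:
+ A minimal sketch; the mel spectrogram is random data used only to show the
+ expected `(batch, in_channels, time)` input and the overall upsampling factor
+ of 8 * 8 * 2 * 2 = 256.
+
+ >>> vocoder = hifigan_vocoder_v1()
+ >>> mel = torch.rand(1, 80, 100)
+ >>> waveform = vocoder(mel)  # (1, 1, 25600)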
+ """
+ return hifigan_vocoder(
+ upsample_rates=(8, 8, 2, 2),
+ upsample_kernel_sizes=(16, 16, 4, 4),
+ upsample_initial_channel=512,
+ resblock_kernel_sizes=(3, 7, 11),
+ resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+ resblock_type=1,
+ in_channels=80,
+ lrelu_slope=0.1,
+ )
+
+
+def hifigan_vocoder_v2() -> HiFiGANVocoder:
+ r"""Builds HiFiGAN Vocoder with V2 architecture :cite:`NEURIPS2020_c5d73680`.
+
+ Returns:
+ HiFiGANVocoder: generated model.
+ """
+ return hifigan_vocoder(
+ upsample_rates=(8, 8, 2, 2),
+ upsample_kernel_sizes=(16, 16, 4, 4),
+ upsample_initial_channel=128,
+ resblock_kernel_sizes=(3, 7, 11),
+ resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+ resblock_type=1,
+ in_channels=80,
+ lrelu_slope=0.1,
+ )
+
+
+def hifigan_vocoder_v3() -> HiFiGANVocoder:
+ r"""Builds HiFiGAN Vocoder with V3 architecture :cite:`NEURIPS2020_c5d73680`.
+
+ Returns:
+ HiFiGANVocoder: generated model.
+ """
+ return hifigan_vocoder(
+ upsample_rates=(8, 8, 4),
+ upsample_kernel_sizes=(16, 16, 8),
+ upsample_initial_channel=256,
+ resblock_kernel_sizes=(3, 5, 7),
+ resblock_dilation_sizes=((1, 2), (2, 6), (3, 12)),
+ resblock_type=2,
+ in_channels=80,
+ lrelu_slope=0.1,
+ )
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/rnnt.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/rnnt.py
new file mode 100644
index 0000000000000000000000000000000000000000..18a620f76052fb38024641d62598df062e1a94ab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/rnnt.py
@@ -0,0 +1,711 @@
+import math
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torchaudio.models import Conformer, RNNT
+from torchaudio.models.rnnt import _Joiner, _Predictor, _TimeReduction, _Transcriber
+
+
+TrieNode = Tuple[Dict[int, "TrieNode"], int, Optional[Tuple[int, int]]]
+
+
+class _ConformerEncoder(torch.nn.Module, _Transcriber):
+ def __init__(
+ self,
+ *,
+ input_dim: int,
+ output_dim: int,
+ time_reduction_stride: int,
+ conformer_input_dim: int,
+ conformer_ffn_dim: int,
+ conformer_num_layers: int,
+ conformer_num_heads: int,
+ conformer_depthwise_conv_kernel_size: int,
+ conformer_dropout: float,
+ ) -> None:
+ super().__init__()
+ self.time_reduction = _TimeReduction(time_reduction_stride)
+ self.input_linear = torch.nn.Linear(input_dim * time_reduction_stride, conformer_input_dim)
+ self.conformer = Conformer(
+ num_layers=conformer_num_layers,
+ input_dim=conformer_input_dim,
+ ffn_dim=conformer_ffn_dim,
+ num_heads=conformer_num_heads,
+ depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
+ dropout=conformer_dropout,
+ use_group_norm=True,
+ convolution_first=True,
+ )
+ self.output_linear = torch.nn.Linear(conformer_input_dim, output_dim)
+ self.layer_norm = torch.nn.LayerNorm(output_dim)
+
+ def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ time_reduction_out, time_reduction_lengths = self.time_reduction(input, lengths)
+ input_linear_out = self.input_linear(time_reduction_out)
+ x, lengths = self.conformer(input_linear_out, time_reduction_lengths)
+ output_linear_out = self.output_linear(x)
+ layer_norm_out = self.layer_norm(output_linear_out)
+ return layer_norm_out, lengths
+
+ def infer(
+ self,
+ input: torch.Tensor,
+ lengths: torch.Tensor,
+ states: Optional[List[List[torch.Tensor]]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
+ raise RuntimeError("Conformer does not support streaming inference.")
+
+
+class _JoinerBiasing(torch.nn.Module):
+ r"""Recurrent neural network transducer (RNN-T) joint network.
+
+ Args:
+ input_dim (int): source and target input dimension.
+ output_dim (int): output dimension.
+ activation (str, optional): activation function to use in the joiner.
+ Must be one of ("relu", "tanh"). (Default: "relu")
+ biasing (bool): if ``True``, perform biasing. (Default: ``False``)
+ deepbiasing (bool): if ``True``, perform deep biasing. (Default: ``False``)
+ attndim (int): dimension of the biasing vector ``hptr``. (Default: 1)
+
+ """
+
+ def __init__(
+ self,
+ input_dim: int,
+ output_dim: int,
+ activation: str = "relu",
+ biasing: bool = False,
+ deepbiasing: bool = False,
+ attndim: int = 1,
+ ) -> None:
+ super().__init__()
+ self.linear = torch.nn.Linear(input_dim, output_dim, bias=True)
+ self.biasing = biasing
+ self.deepbiasing = deepbiasing
+ if self.biasing and self.deepbiasing:
+ self.biasinglinear = torch.nn.Linear(attndim, input_dim, bias=True)
+ self.attndim = attndim
+ if activation == "relu":
+ self.activation = torch.nn.ReLU()
+ elif activation == "tanh":
+ self.activation = torch.nn.Tanh()
+ else:
+ raise ValueError(f"Unsupported activation {activation}")
+
+ def forward(
+ self,
+ source_encodings: torch.Tensor,
+ source_lengths: torch.Tensor,
+ target_encodings: torch.Tensor,
+ target_lengths: torch.Tensor,
+ hptr: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ U: maximum target sequence length in batch;
+ D: dimension of each source and target sequence encoding.
+
+ Args:
+ source_encodings (torch.Tensor): source encoding sequences, with
+ shape `(B, T, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``source_encodings``.
+ target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``target_encodings``.
+ hptr (torch.Tensor or None, optional): deep biasing vector with shape `(B, T, U, A)`,
+ where A is the TCPGen attention dimension. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor):
+ torch.Tensor
+ joint network output, with shape `(B, T, U, output_dim)`.
+ torch.Tensor
+ output source lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 1 for i-th batch element in joint network output.
+ torch.Tensor
+ output target lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 2 for i-th batch element in joint network output.
+ torch.Tensor
+ joint network second last layer output (i.e. before self.linear), with shape `(B, T, U, D)`.
+ """
+ joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous()
+ if self.biasing and self.deepbiasing and hptr is not None:
+ hptr = self.biasinglinear(hptr)
+ joint_encodings += hptr
+ elif self.biasing and self.deepbiasing:
+ # Hack here for unused parameters
+ joint_encodings += self.biasinglinear(joint_encodings.new_zeros(1, self.attndim)).mean() * 0
+ activation_out = self.activation(joint_encodings)
+ output = self.linear(activation_out)
+ return output, source_lengths, target_lengths, activation_out
+
+
+class RNNTBiasing(RNNT):
+ r"""torchaudio.models.RNNT()
+
+ Recurrent neural network transducer (RNN-T) model.
+
+ Note:
+ To build the model, please use one of the factory functions.
+
+ Args:
+ transcriber (torch.nn.Module): transcription network.
+ predictor (torch.nn.Module): prediction network.
+ joiner (torch.nn.Module): joint network.
+ attndim (int): TCPGen attention dimension.
+ biasing (bool): if ``True``, use biasing; otherwise use the standard RNN-T.
+ deepbiasing (bool): if ``True``, use deep biasing by extracting the biasing vector.
+ embdim (int): dimension of symbol embeddings.
+ jointdim (int): dimension of the joint network.
+ charlist (List[str]): list of word-piece tokens in the same order as the output layer.
+ encoutdim (int): dimension of the encoder output vectors.
+ dropout_tcpgen (float): dropout rate for TCPGen.
+ tcpsche (int): the epoch at which TCPGen starts to train.
+ DBaverage (bool): if ``True``, use DBRNNT for biasing instead of TCPGen.
+ """
+
+ def __init__(
+ self,
+ transcriber: _Transcriber,
+ predictor: _Predictor,
+ joiner: _Joiner,
+ attndim: int,
+ biasing: bool,
+ deepbiasing: bool,
+ embdim: int,
+ jointdim: int,
+ charlist: List[str],
+ encoutdim: int,
+ dropout_tcpgen: float,
+ tcpsche: int,
+ DBaverage: bool,
+ ) -> None:
+ super().__init__(transcriber, predictor, joiner)
+ self.attndim = attndim
+ self.deepbiasing = deepbiasing
+ self.jointdim = jointdim
+ self.embdim = embdim
+ self.encoutdim = encoutdim
+ self.char_list = charlist or []
+ self.blank_idx = self.char_list.index("")
+ self.nchars = len(self.char_list)
+ self.DBaverage = DBaverage
+ self.biasing = biasing
+ if self.biasing:
+ if self.deepbiasing and self.DBaverage:
+ # Deep biasing without TCPGen
+ self.biasingemb = torch.nn.Linear(self.nchars, self.attndim, bias=False)
+ else:
+ # TCPGen parameters
+ self.ooKBemb = torch.nn.Embedding(1, self.embdim)
+ self.Qproj_char = torch.nn.Linear(self.embdim, self.attndim)
+ self.Qproj_acoustic = torch.nn.Linear(self.encoutdim, self.attndim)
+ self.Kproj = torch.nn.Linear(self.embdim, self.attndim)
+ self.pointer_gate = torch.nn.Linear(self.attndim + self.jointdim, 1)
+ self.dropout_tcpgen = torch.nn.Dropout(dropout_tcpgen)
+ self.tcpsche = tcpsche
+
+ def forward(
+ self,
+ sources: torch.Tensor,
+ source_lengths: torch.Tensor,
+ targets: torch.Tensor,
+ target_lengths: torch.Tensor,
+ tries: TrieNode,
+ current_epoch: int,
+ predictor_state: Optional[List[List[torch.Tensor]]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]], torch.Tensor, torch.Tensor]:
+ r"""Forward pass for training.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ U: maximum target sequence length in batch;
+ D: feature dimension of each source sequence element.
+
+ Args:
+ sources (torch.Tensor): source frame sequences right-padded with right context, with
+ shape `(B, T, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``sources``.
+ targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
+ mapping to a target symbol.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ number of valid frames for i-th batch element in ``targets``.
+ tries (TrieNode): word-piece prefix tree (trie) representing the biasing list to be searched.
+ current_epoch (int): the current epoch number, used to determine whether TCPGen
+ should be trained at this epoch.
+ predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
+ representing prediction network internal state generated in preceding invocation
+ of ``forward``. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]], torch.Tensor, torch.Tensor):
+ torch.Tensor
+ joint network output, with shape
+ `(B, max output source length, max output target length, output_dim (number of target symbols))`.
+ torch.Tensor
+ output source lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 1 for i-th batch element in joint network output.
+ torch.Tensor
+ output target lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 2 for i-th batch element in joint network output.
+ List[List[torch.Tensor]]
+ output states; list of lists of tensors
+ representing prediction network internal state generated in current invocation
+ of ``forward``.
+ torch.Tensor
+ TCPGen distribution, with shape
+ `(B, max output source length, max output target length, output_dim (number of target symbols))`.
+ torch.Tensor
+ Generation probability (or copy probability), with shape
+ `(B, max output source length, max output target length, 1)`.
+ """
+ source_encodings, source_lengths = self.transcriber(
+ input=sources,
+ lengths=source_lengths,
+ )
+ target_encodings, target_lengths, predictor_state = self.predictor(
+ input=targets,
+ lengths=target_lengths,
+ state=predictor_state,
+ )
+ # Forward TCPGen
+ hptr = None
+ tcpgen_dist, p_gen = None, None
+ if self.biasing and current_epoch >= self.tcpsche and tries != []:
+ ptrdist_mask, p_gen_mask = self.get_tcpgen_step_masks(targets, tries)
+ hptr, tcpgen_dist = self.forward_tcpgen(targets, ptrdist_mask, source_encodings)
+ hptr = self.dropout_tcpgen(hptr)
+ elif self.biasing:
+ # Hack here to bypass unused parameters
+ if self.DBaverage and self.deepbiasing:
+ dummy = self.biasingemb(source_encodings.new_zeros(1, len(self.char_list))).mean()
+ else:
+ dummy = source_encodings.new_zeros(1, self.embdim)
+ dummy = self.Qproj_char(dummy).mean()
+ dummy += self.Qproj_acoustic(source_encodings.new_zeros(1, source_encodings.size(-1))).mean()
+ dummy += self.Kproj(source_encodings.new_zeros(1, self.embdim)).mean()
+ dummy += self.pointer_gate(source_encodings.new_zeros(1, self.attndim + self.jointdim)).mean()
+ dummy += self.ooKBemb.weight.mean()
+ dummy = dummy * 0
+ source_encodings += dummy
+
+ output, source_lengths, target_lengths, jointer_activation = self.joiner(
+ source_encodings=source_encodings,
+ source_lengths=source_lengths,
+ target_encodings=target_encodings,
+ target_lengths=target_lengths,
+ hptr=hptr,
+ )
+
+ # Calculate Generation Probability
+ if self.biasing and hptr is not None and tcpgen_dist is not None:
+ p_gen = torch.sigmoid(self.pointer_gate(torch.cat((jointer_activation, hptr), dim=-1)))
+ # avoid collapsing to ooKB token in the first few updates
+ # if current_epoch == self.tcpsche:
+ # p_gen = p_gen * 0.1
+ p_gen = p_gen.masked_fill(p_gen_mask.bool().unsqueeze(1).unsqueeze(-1), 0)
+
+ return (output, source_lengths, target_lengths, predictor_state, tcpgen_dist, p_gen)
+
+ def get_tcpgen_distribution(self, query, ptrdist_mask):
+ # Make use of the predictor embedding matrix
+ keyvalues = torch.cat([self.predictor.embedding.weight.data, self.ooKBemb.weight], dim=0)
+ keyvalues = self.dropout_tcpgen(self.Kproj(keyvalues))
+ # B * T * U * attndim, nbpe * attndim -> B * T * U * nbpe
+ tcpgendist = torch.einsum("ntuj,ij->ntui", query, keyvalues)
+ tcpgendist = tcpgendist / math.sqrt(query.size(-1))
+ ptrdist_mask = ptrdist_mask.unsqueeze(1).repeat(1, tcpgendist.size(1), 1, 1)
+ tcpgendist.masked_fill_(ptrdist_mask.bool(), -1e9)
+ tcpgendist = torch.nn.functional.softmax(tcpgendist, dim=-1)
+ # B * T * U * nbpe, nbpe * attndim -> B * T * U * attndim
+ hptr = torch.einsum("ntui,ij->ntuj", tcpgendist[:, :, :, :-1], keyvalues[:-1, :])
+ return hptr, tcpgendist
+
+ def forward_tcpgen(self, targets, ptrdist_mask, source_encodings):
+ tcpgen_dist = None
+ if self.DBaverage and self.deepbiasing:
+ hptr = self.biasingemb(1 - ptrdist_mask[:, :, :-1].float()).unsqueeze(1)
+ else:
+ query_char = self.predictor.embedding(targets)
+ query_char = self.Qproj_char(query_char).unsqueeze(1) # B * 1 * U * attndim
+ query_acoustic = self.Qproj_acoustic(source_encodings).unsqueeze(2) # B * T * 1 * attndim
+ query = query_char + query_acoustic # B * T * U * attndim
+ hptr, tcpgen_dist = self.get_tcpgen_distribution(query, ptrdist_mask)
+ return hptr, tcpgen_dist
+
+ def get_tcpgen_step_masks(self, yseqs, resettrie):
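+ # Walk the biasing trie along each partial hypothesis: children of the current
+ # trie node are left unmasked (0) in ``batch_masks`` for the next step, and
+ # ``p_gen_masks`` marks steps at which the hypothesis has fallen off the trie.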
+ seqlen = len(yseqs[0])
+ batch_masks = yseqs.new_ones(len(yseqs), seqlen, len(self.char_list) + 1)
+ p_gen_masks = []
+ for i, yseq in enumerate(yseqs):
+ new_tree = resettrie
+ p_gen_mask = []
+ for j, vy in enumerate(yseq):
+ vy = vy.item()
+ new_tree = new_tree[0]
+ if vy in [self.blank_idx]:
+ new_tree = resettrie
+ p_gen_mask.append(0)
+ elif self.char_list[vy].endswith("▁"):
+ if vy in new_tree and new_tree[vy][0] != {}:
+ new_tree = new_tree[vy]
+ else:
+ new_tree = resettrie
+ p_gen_mask.append(0)
+ elif vy not in new_tree:
+ new_tree = [{}]
+ p_gen_mask.append(1)
+ else:
+ new_tree = new_tree[vy]
+ p_gen_mask.append(0)
+ batch_masks[i, j, list(new_tree[0].keys())] = 0
+ # In the original paper, the ooKB node was not masked.
+ # In this implementation, leaving the ooKB node unmasked makes its probability
+ # collapse to 1.0 within the first few updates; the cause has not been identified.
+ # batch_masks[i, j, -1] = 0
+ p_gen_masks.append(p_gen_mask + [1] * (seqlen - len(p_gen_mask)))
+ p_gen_masks = torch.Tensor(p_gen_masks).to(yseqs.device).byte()
+ return batch_masks, p_gen_masks
+
+ def get_tcpgen_step_masks_prefix(self, yseqs, resettrie):
+ # Implemented for prefix-based wordpieces, not tested yet
+ seqlen = len(yseqs[0])
+ batch_masks = yseqs.new_ones(len(yseqs), seqlen, len(self.char_list) + 1)
+ p_gen_masks = []
+ for i, yseq in enumerate(yseqs):
+ p_gen_mask = []
+ new_tree = resettrie
+ for j, vy in enumerate(yseq):
+ vy = vy.item()
+ new_tree = new_tree[0]
+ if vy in [self.blank_idx]:
+ new_tree = resettrie
+ batch_masks[i, j, list(new_tree[0].keys())] = 0
+ elif self.char_list[vy].startswith("▁"):
+ new_tree = resettrie
+ if vy not in new_tree[0]:
+ batch_masks[i, j, list(new_tree[0].keys())] = 0
+ else:
+ new_tree = new_tree[0][vy]
+ batch_masks[i, j, list(new_tree[0].keys())] = 0
+ if new_tree[1] != -1:
+ batch_masks[i, j, list(resettrie[0].keys())] = 0
+ else:
+ if vy not in new_tree:
+ new_tree = resettrie
+ batch_masks[i, j, list(new_tree[0].keys())] = 0
+ else:
+ new_tree = new_tree[vy]
+ batch_masks[i, j, list(new_tree[0].keys())] = 0
+ if new_tree[1] != -1:
+ batch_masks[i, j, list(resettrie[0].keys())] = 0
+ p_gen_mask.append(0)
+ # batch_masks[i, j, -1] = 0
+ p_gen_masks.append(p_gen_mask + [1] * (seqlen - len(p_gen_mask)))
+ p_gen_masks = torch.Tensor(p_gen_masks).to(yseqs.device).byte()
+
+ return batch_masks, p_gen_masks
+
+ def get_tcpgen_step(self, vy, trie, resettrie):
+ new_tree = trie[0]
+ if vy in [self.blank_idx]:
+ new_tree = resettrie
+ elif self.char_list[vy].endswith("▁"):
+ if vy in new_tree and new_tree[vy][0] != {}:
+ new_tree = new_tree[vy]
+ else:
+ new_tree = resettrie
+ elif vy not in new_tree:
+ new_tree = [{}]
+ else:
+ new_tree = new_tree[vy]
+ return new_tree
+
+ def join(
+ self,
+ source_encodings: torch.Tensor,
+ source_lengths: torch.Tensor,
+ target_encodings: torch.Tensor,
+ target_lengths: torch.Tensor,
+ hptr: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ r"""Applies joint network to source and target encodings.
+
+ B: batch size;
+ T: maximum source sequence length in batch;
+ U: maximum target sequence length in batch;
+ D: dimension of each source and target sequence encoding.
+ A: TCPGen attention dimension
+
+ Args:
+ source_encodings (torch.Tensor): source encoding sequences, with
+ shape `(B, T, D)`.
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``source_encodings``.
+ target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
+ target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
+ valid sequence length of i-th batch element in ``target_encodings``.
+ hptr (torch.Tensor): deep biasing vector with shape `(B, T, U, A)`.
+
+ Returns:
+ (torch.Tensor, torch.Tensor, torch.Tensor):
+ torch.Tensor
+ joint network output, with shape `(B, T, U, output_dim)`.
+ torch.Tensor
+ output source lengths, with shape `(B,)` and i-th element representing
+ number of valid elements along dim 1 for i-th batch element in joint network output.
+ torch.Tensor
+ joint network second last layer output, with shape `(B, T, U, D)`.
+ """
+ output, source_lengths, target_lengths, jointer_activation = self.joiner(
+ source_encodings=source_encodings,
+ source_lengths=source_lengths,
+ target_encodings=target_encodings,
+ target_lengths=target_lengths,
+ hptr=hptr,
+ )
+ return output, source_lengths, jointer_activation
+
+
+def conformer_rnnt_model(
+ *,
+ input_dim: int,
+ encoding_dim: int,
+ time_reduction_stride: int,
+ conformer_input_dim: int,
+ conformer_ffn_dim: int,
+ conformer_num_layers: int,
+ conformer_num_heads: int,
+ conformer_depthwise_conv_kernel_size: int,
+ conformer_dropout: float,
+ num_symbols: int,
+ symbol_embedding_dim: int,
+ num_lstm_layers: int,
+ lstm_hidden_dim: int,
+ lstm_layer_norm: bool,
+ lstm_layer_norm_epsilon: float,
+ lstm_dropout: float,
+ joiner_activation: str,
+) -> RNNT:
+ r"""Builds Conformer-based recurrent neural network transducer (RNN-T) model.
+
+ Args:
+ input_dim (int): dimension of input sequence frames passed to transcription network.
+ encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
+ passed to joint network.
+ time_reduction_stride (int): factor by which to reduce length of input sequence.
+ conformer_input_dim (int): dimension of Conformer input.
+ conformer_ffn_dim (int): hidden layer dimension of each Conformer layer's feedforward network.
+ conformer_num_layers (int): number of Conformer layers to instantiate.
+ conformer_num_heads (int): number of attention heads in each Conformer layer.
+ conformer_depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
+ conformer_dropout (float): Conformer dropout probability.
+ num_symbols (int): cardinality of set of target tokens.
+ symbol_embedding_dim (int): dimension of each target token embedding.
+ num_lstm_layers (int): number of LSTM layers to instantiate.
+ lstm_hidden_dim (int): output dimension of each LSTM layer.
+ lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
+ lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
+ lstm_dropout (float): LSTM dropout probability.
+ joiner_activation (str): activation function to use in the joiner.
+ Must be one of ("relu", "tanh"). (Default: "relu")
+
+ Returns:
+ RNNT:
+ Conformer RNN-T model.
+ """
+ encoder = _ConformerEncoder(
+ input_dim=input_dim,
+ output_dim=encoding_dim,
+ time_reduction_stride=time_reduction_stride,
+ conformer_input_dim=conformer_input_dim,
+ conformer_ffn_dim=conformer_ffn_dim,
+ conformer_num_layers=conformer_num_layers,
+ conformer_num_heads=conformer_num_heads,
+ conformer_depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
+ conformer_dropout=conformer_dropout,
+ )
+ predictor = _Predictor(
+ num_symbols=num_symbols,
+ output_dim=encoding_dim,
+ symbol_embedding_dim=symbol_embedding_dim,
+ num_lstm_layers=num_lstm_layers,
+ lstm_hidden_dim=lstm_hidden_dim,
+ lstm_layer_norm=lstm_layer_norm,
+ lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
+ lstm_dropout=lstm_dropout,
+ )
+ joiner = _Joiner(encoding_dim, num_symbols, activation=joiner_activation)
+ return RNNT(encoder, predictor, joiner)
+
+
+def conformer_rnnt_base() -> RNNT:
+ r"""Builds basic version of Conformer RNN-T model.
+
+ Returns:
+ RNNT:
+ Conformer RNN-T model.
+ """
+ return conformer_rnnt_model(
+ input_dim=80,
+ encoding_dim=1024,
+ time_reduction_stride=4,
+ conformer_input_dim=256,
+ conformer_ffn_dim=1024,
+ conformer_num_layers=16,
+ conformer_num_heads=4,
+ conformer_depthwise_conv_kernel_size=31,
+ conformer_dropout=0.1,
+ num_symbols=1024,
+ symbol_embedding_dim=256,
+ num_lstm_layers=2,
+ lstm_hidden_dim=512,
+ lstm_layer_norm=True,
+ lstm_layer_norm_epsilon=1e-5,
+ lstm_dropout=0.3,
+ joiner_activation="tanh",
+ )
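+
+
+# Usage sketch (illustrative comments only, not part of the module): exercising the base
+# factory above with random tensors.  Shapes follow the docstrings in this file; the 80-dim
+# log-mel features and the token ids are placeholders.
+#
+#   model = conformer_rnnt_base()
+#   sources = torch.rand(2, 400, 80)                         # (B, T, input_dim)
+#   source_lengths = torch.tensor([400, 360])                # valid frames per utterance
+#   targets = torch.randint(1, 1024, (2, 10), dtype=torch.int32)
+#   target_lengths = torch.tensor([10, 8], dtype=torch.int32)
+#   output, out_src_lengths, out_tgt_lengths, _ = model(sources, source_lengths, targets, target_lengths)
+#   # output: joint network logits, roughly (B, T // time_reduction_stride, U, num_symbols)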
+
+
+def conformer_rnnt_biasing(
+ *,
+ input_dim: int,
+ encoding_dim: int,
+ time_reduction_stride: int,
+ conformer_input_dim: int,
+ conformer_ffn_dim: int,
+ conformer_num_layers: int,
+ conformer_num_heads: int,
+ conformer_depthwise_conv_kernel_size: int,
+ conformer_dropout: float,
+ num_symbols: int,
+ symbol_embedding_dim: int,
+ num_lstm_layers: int,
+ lstm_hidden_dim: int,
+    lstm_layer_norm: bool,
+    lstm_layer_norm_epsilon: float,
+    lstm_dropout: float,
+ joiner_activation: str,
+ attndim: int,
+ biasing: bool,
+ charlist: List[str],
+ deepbiasing: bool,
+ tcpsche: int,
+ DBaverage: bool,
+) -> RNNTBiasing:
+ r"""Builds Conformer-based recurrent neural network transducer (RNN-T) model.
+
+ Args:
+ input_dim (int): dimension of input sequence frames passed to transcription network.
+ encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
+ passed to joint network.
+ time_reduction_stride (int): factor by which to reduce length of input sequence.
+ conformer_input_dim (int): dimension of Conformer input.
+ conformer_ffn_dim (int): hidden layer dimension of each Conformer layer's feedforward network.
+ conformer_num_layers (int): number of Conformer layers to instantiate.
+ conformer_num_heads (int): number of attention heads in each Conformer layer.
+ conformer_depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
+ conformer_dropout (float): Conformer dropout probability.
+ num_symbols (int): cardinality of set of target tokens.
+ symbol_embedding_dim (int): dimension of each target token embedding.
+ num_lstm_layers (int): number of LSTM layers to instantiate.
+ lstm_hidden_dim (int): output dimension of each LSTM layer.
+ lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
+ lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
+ lstm_dropout (float): LSTM dropout probability.
+ joiner_activation (str): activation function to use in the joiner.
+            Must be one of ("relu", "tanh").
+ attndim (int): TCPGen attention dimension
+ biasing (bool): If true, use biasing, otherwise use standard RNN-T
+ charlist (list): The list of word piece tokens in the same order as the output layer
+ deepbiasing (bool): If true, use deep biasing by extracting the biasing vector
+ tcpsche (int): The epoch at which TCPGen starts to train
+ DBaverage (bool): If true, instead of TCPGen, use DBRNNT for biasing
+
+ Returns:
+ RNNT:
+ Conformer RNN-T model with TCPGen-based biasing support.
+ """
+ encoder = _ConformerEncoder(
+ input_dim=input_dim,
+ output_dim=encoding_dim,
+ time_reduction_stride=time_reduction_stride,
+ conformer_input_dim=conformer_input_dim,
+ conformer_ffn_dim=conformer_ffn_dim,
+ conformer_num_layers=conformer_num_layers,
+ conformer_num_heads=conformer_num_heads,
+ conformer_depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
+ conformer_dropout=conformer_dropout,
+ )
+ predictor = _Predictor(
+ num_symbols=num_symbols,
+ output_dim=encoding_dim,
+ symbol_embedding_dim=symbol_embedding_dim,
+ num_lstm_layers=num_lstm_layers,
+ lstm_hidden_dim=lstm_hidden_dim,
+ lstm_layer_norm=lstm_layer_norm,
+ lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
+ lstm_dropout=lstm_dropout,
+ )
+ joiner = _JoinerBiasing(
+ encoding_dim,
+ num_symbols,
+ activation=joiner_activation,
+ deepbiasing=deepbiasing,
+ attndim=attndim,
+ biasing=biasing,
+ )
+ return RNNTBiasing(
+ encoder,
+ predictor,
+ joiner,
+ attndim,
+ biasing,
+ deepbiasing,
+ symbol_embedding_dim,
+ encoding_dim,
+ charlist,
+ encoding_dim,
+ conformer_dropout,
+ tcpsche,
+ DBaverage,
+ )
+
+
+def conformer_rnnt_biasing_base(charlist=None, biasing=True) -> RNNT:
+ r"""Builds basic version of Conformer RNN-T model with TCPGen.
+
+ Returns:
+ RNNT:
+ Conformer RNN-T model with TCPGen-based biasing support.
+ """
+ return conformer_rnnt_biasing(
+ input_dim=80,
+ encoding_dim=576,
+ time_reduction_stride=4,
+ conformer_input_dim=144,
+ conformer_ffn_dim=576,
+ conformer_num_layers=16,
+ conformer_num_heads=4,
+ conformer_depthwise_conv_kernel_size=31,
+ conformer_dropout=0.1,
+ num_symbols=601,
+ symbol_embedding_dim=256,
+ num_lstm_layers=1,
+ lstm_hidden_dim=320,
+ lstm_layer_norm=True,
+ lstm_layer_norm_epsilon=1e-5,
+ lstm_dropout=0.3,
+ joiner_activation="tanh",
+ attndim=256,
+ biasing=biasing,
+ charlist=charlist,
+ deepbiasing=True,
+ tcpsche=30,
+ DBaverage=False,
+ )
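+
+
+# Illustrative sketch (placeholders, not part of the module): the biasing variant additionally
+# needs the wordpiece inventory of the output layer and, at decode time, a TCPGen prefix tree.
+# ``load_wordpiece_list`` is a hypothetical helper returning the token list sized to match the
+# 601-symbol output layer of the base configuration above.
+#
+#   spm_char_list = load_wordpiece_list()   # hypothetical; List[str] in output-layer order
+#   model = conformer_rnnt_biasing_base(charlist=spm_char_list, biasing=True)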
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/models/rnnt_decoder.py b/MLPY/Lib/site-packages/torchaudio/prototype/models/rnnt_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..129a1df27bcb8692556f3c00a58c1855c0da8ded
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/models/rnnt_decoder.py
@@ -0,0 +1,399 @@
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+from torchaudio.models import RNNT
+from torchaudio.prototype.models.rnnt import TrieNode
+
+__all__ = ["Hypothesis", "RNNTBeamSearchBiasing"]
+
+
+Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float, list]
+Hypothesis.__doc__ = """Hypothesis generated by the RNN-T beam search decoder with biasing support,
+    represented as a tuple of (tokens, prediction network output, prediction network state, score,
+    TCPGen prefix-tree node).
+    """
+
+
+def _get_hypo_tokens(hypo: Hypothesis) -> List[int]:
+ return hypo[0]
+
+
+def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor:
+ return hypo[1]
+
+
+def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]:
+ return hypo[2]
+
+
+def _get_hypo_score(hypo: Hypothesis) -> float:
+ return hypo[3]
+
+
+def _get_hypo_trie(hypo: Hypothesis) -> TrieNode:
+ return hypo[4]
+
+
+def _set_hypo_trie(hypo: Hypothesis, trie: TrieNode) -> None:
+ hypo[4] = trie
+
+
+def _get_hypo_key(hypo: Hypothesis) -> str:
+ return str(hypo[0])
+
+
+def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]:
+ states: List[List[torch.Tensor]] = []
+ for i in range(len(_get_hypo_state(hypos[0]))):
+ batched_state_components: List[torch.Tensor] = []
+ for j in range(len(_get_hypo_state(hypos[0])[i])):
+ batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos]))
+ states.append(batched_state_components)
+ return states
+
+
+def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]:
+ idx_tensor = torch.tensor([idx], device=device)
+ return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states]
+
+
+def _default_hypo_sort_key(hypo: Hypothesis) -> float:
+ return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1)
+
+
+def _compute_updated_scores(
+ hypos: List[Hypothesis],
+ next_token_probs: torch.Tensor,
+ beam_width: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1)
+ nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1]
+ nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width)
+ nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc")
+ nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1]
+ return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token
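+
+
+# Worked example (comments only): with 2 hypotheses and 4 tokens (the last being blank),
+# ``nonblank_scores`` has shape (2, 3); top-k over the flattened tensor is mapped back to
+# (hypothesis, token) pairs by integer division and modulo, exactly as above.
+#
+#   scores = torch.tensor([[0.0, -1.0, -2.0],
+#                          [-0.5, -0.1, -3.0]])     # (beam_width=2, num_tokens - 1=3)
+#   _, idx = scores.reshape(-1).topk(2)             # idx = tensor([0, 4])
+#   idx.div(3, rounding_mode="trunc")               # hypothesis indices: tensor([0, 1])
+#   idx % 3                                         # token indices:      tensor([0, 1])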
+
+
+def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None:
+ for i, elem in enumerate(hypo_list):
+ if _get_hypo_key(hypo) == _get_hypo_key(elem):
+ del hypo_list[i]
+ break
+
+
+class RNNTBeamSearchBiasing(torch.nn.Module):
+ r"""Beam search decoder for RNN-T model with biasing support.
+
+ Args:
+ model (RNNT): RNN-T model to use.
+ blank (int): index of blank token in vocabulary.
+ temperature (float, optional): temperature to apply to joint network output.
+ Larger values yield more uniform samples. (Default: 1.0)
+ hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score
+ for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns
+ hypothesis score normalized by token sequence length. (Default: None)
+ step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100)
+        trie (TrieNode, optional): prefix tree of biasing words for TCPGen. (Default: ``None``)
+        biasing (bool, optional): if ``True``, apply TCPGen biasing; otherwise perform standard RNN-T decoding.
+            (Default: ``False``)
+ """
+
+ def __init__(
+ self,
+ model: RNNT,
+ blank: int,
+ temperature: float = 1.0,
+ hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None,
+ step_max_tokens: int = 100,
+ trie: TrieNode = None,
+ biasing: bool = False,
+ ) -> None:
+ super().__init__()
+ self.model = model
+ self.blank = blank
+ self.temperature = temperature
+ self.resettrie = trie or []
+ self.dobiasing = biasing
+
+ if hypo_sort_key is None:
+ self.hypo_sort_key = _default_hypo_sort_key
+ else:
+ self.hypo_sort_key = hypo_sort_key
+
+ self.step_max_tokens = step_max_tokens
+
+ def _init_b_hypos(self, hypo: Optional[Hypothesis], device: torch.device) -> List[Hypothesis]:
+ if hypo is not None:
+ token = _get_hypo_tokens(hypo)[-1]
+ state = _get_hypo_state(hypo)
+ else:
+ token = self.blank
+ state = None
+
+ one_tensor = torch.tensor([1], device=device)
+ pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state)
+ init_hypo = ([token], pred_out[0].detach(), pred_state, 0.0, self.resettrie)
+ return [init_hypo]
+
+ def _get_trie_mask(self, trie):
+ step_mask = torch.ones(len(self.model.char_list) + 1)
+ step_mask[list(trie[0].keys())] = 0
+ # step_mask[-1] = 0
+ return step_mask
+
+ def _get_generation_prob(self, trie):
+ if len(trie[0].keys()) == 0:
+ return True
+ else:
+ return False
+
+ def _gen_next_token_probs(
+ self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device
+ ) -> torch.Tensor:
+ one_tensor = torch.tensor([1], device=device)
+ predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0)
+ if self.dobiasing:
+ # Get valid subset of wordpieces
+ trie_masks = torch.stack([self._get_trie_mask(_get_hypo_trie(h)) for h in hypos], dim=0)
+ trie_masks = trie_masks.to(enc_out.device).unsqueeze(1) # beam_width, 1, nchars
+            # Determine whether there are any remaining paths on the trie
+ genprob_masks = torch.tensor([self._get_generation_prob(_get_hypo_trie(h)) for h in hypos]) # beam_width
+ genprob_masks = genprob_masks.to(enc_out.device)
+ # Forward TCPGen component
+ last_tokens = torch.tensor([_get_hypo_tokens(h)[-1] for h in hypos]).unsqueeze(-1).to(enc_out.device)
+ hptr, tcpgen_dist = self.model.forward_tcpgen(last_tokens, trie_masks, enc_out)
+ else:
+ hptr = None
+ # hptr sent to joiner, if deepbiasing is True joiner will use it
+ joined_out, _, joined_activation = self.model.join(
+ enc_out,
+ one_tensor,
+ predictor_out,
+ torch.tensor([1] * len(hypos), device=device),
+ hptr=hptr,
+ ) # [beam_width, 1, 1, num_tokens]
+ if self.dobiasing:
+ p_gen = torch.sigmoid(self.model.pointer_gate(torch.cat((joined_activation, hptr), dim=-1)))
+ p_gen = p_gen.masked_fill(genprob_masks.view(p_gen.size(0), 1, 1, 1), 0)
+ model_tu = torch.softmax(joined_out / self.temperature, dim=3)
+ # assuming last token is blank
+ p_not_null = 1.0 - model_tu[:, :, :, -1:]
+ ptr_dist_fact = torch.cat([tcpgen_dist[:, :, :, :-2], tcpgen_dist[:, :, :, -1:]], dim=-1) * p_not_null
+ ptr_gen_complement = tcpgen_dist[:, :, :, -1:] * p_gen
+ p_partial = ptr_dist_fact[:, :, :, :-1] * p_gen + model_tu[:, :, :, :-1] * (1 - p_gen + ptr_gen_complement)
+ p_final = torch.cat([p_partial, model_tu[:, :, :, -1:]], dim=-1)
+ joined_out = torch.log(p_final)
+ else:
+ joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3)
+ return joined_out[:, 0, 0]
+
+ def _gen_b_hypos(
+ self,
+ b_hypos: List[Hypothesis],
+ a_hypos: List[Hypothesis],
+ next_token_probs: torch.Tensor,
+ key_to_b_hypo: Dict[str, Hypothesis],
+ ) -> List[Hypothesis]:
+ for i in range(len(a_hypos)):
+ h_a = a_hypos[i]
+ append_blank_score = _get_hypo_score(h_a) + next_token_probs[i, -1]
+ if _get_hypo_key(h_a) in key_to_b_hypo:
+ h_b = key_to_b_hypo[_get_hypo_key(h_a)]
+ _remove_hypo(h_b, b_hypos)
+ score = float(torch.tensor(_get_hypo_score(h_b)).logaddexp(append_blank_score))
+ else:
+ score = float(append_blank_score)
+ h_b = (
+ _get_hypo_tokens(h_a),
+ _get_hypo_predictor_out(h_a),
+ _get_hypo_state(h_a),
+ score,
+ _get_hypo_trie(h_a),
+ )
+ b_hypos.append(h_b)
+ key_to_b_hypo[_get_hypo_key(h_b)] = h_b
+ _, sorted_idx = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]).sort()
+ return [b_hypos[idx] for idx in sorted_idx]
+
+ def _gen_a_hypos(
+ self,
+ a_hypos: List[Hypothesis],
+ b_hypos: List[Hypothesis],
+ next_token_probs: torch.Tensor,
+ t: int,
+ beam_width: int,
+ device: torch.device,
+ ) -> List[Hypothesis]:
+ (
+ nonblank_nbest_scores,
+ nonblank_nbest_hypo_idx,
+ nonblank_nbest_token,
+ ) = _compute_updated_scores(a_hypos, next_token_probs, beam_width)
+
+ if len(b_hypos) < beam_width:
+ b_nbest_score = -float("inf")
+ else:
+ b_nbest_score = _get_hypo_score(b_hypos[-beam_width])
+
+ base_hypos: List[Hypothesis] = []
+ new_tokens: List[int] = []
+ new_scores: List[float] = []
+ for i in range(beam_width):
+ score = float(nonblank_nbest_scores[i])
+ if score > b_nbest_score:
+ a_hypo_idx = int(nonblank_nbest_hypo_idx[i])
+ base_hypos.append(a_hypos[a_hypo_idx])
+ new_tokens.append(int(nonblank_nbest_token[i]))
+ new_scores.append(score)
+
+ if base_hypos:
+ new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device)
+ else:
+ new_hypos: List[Hypothesis] = []
+
+ return new_hypos
+
+ def _gen_new_hypos(
+ self,
+ base_hypos: List[Hypothesis],
+ tokens: List[int],
+ scores: List[float],
+ t: int,
+ device: torch.device,
+ ) -> List[Hypothesis]:
+ tgt_tokens = torch.tensor([[token] for token in tokens], device=device)
+ states = _batch_state(base_hypos)
+ pred_out, _, pred_states = self.model.predict(
+ tgt_tokens,
+ torch.tensor([1] * len(base_hypos), device=device),
+ states,
+ )
+ new_hypos: List[Hypothesis] = []
+ for i, h_a in enumerate(base_hypos):
+ new_tokens = _get_hypo_tokens(h_a) + [tokens[i]]
+ if self.dobiasing:
+ new_trie = self.model.get_tcpgen_step(tokens[i], _get_hypo_trie(h_a), self.resettrie)
+ else:
+ new_trie = self.resettrie
+ new_hypos.append(
+ (new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i], new_trie)
+ )
+ return new_hypos
+
+ def _search(
+ self,
+ enc_out: torch.Tensor,
+ hypo: Optional[Hypothesis],
+ beam_width: int,
+ ) -> List[Hypothesis]:
+ n_time_steps = enc_out.shape[1]
+ device = enc_out.device
+
+ a_hypos: List[Hypothesis] = []
+ b_hypos = self._init_b_hypos(hypo, device)
+ for t in range(n_time_steps):
+ a_hypos = b_hypos
+ b_hypos = torch.jit.annotate(List[Hypothesis], [])
+ key_to_b_hypo: Dict[str, Hypothesis] = {}
+ symbols_current_t = 0
+
+ while a_hypos:
+ next_token_probs = self._gen_next_token_probs(enc_out[:, t : t + 1], a_hypos, device)
+ next_token_probs = next_token_probs.cpu()
+ b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo)
+
+ if symbols_current_t == self.step_max_tokens:
+ break
+
+ a_hypos = self._gen_a_hypos(
+ a_hypos,
+ b_hypos,
+ next_token_probs,
+ t,
+ beam_width,
+ device,
+ )
+ if a_hypos:
+ symbols_current_t += 1
+
+ _, sorted_idx = torch.tensor([self.hypo_sort_key(hypo) for hypo in b_hypos]).topk(beam_width)
+ b_hypos = [b_hypos[idx] for idx in sorted_idx]
+
+ return b_hypos
+
+ def forward(
+ self,
+ input: torch.Tensor,
+ length: torch.Tensor,
+ beam_width: int,
+ ) -> List[Hypothesis]:
+ r"""Performs beam search for the given input sequence.
+
+ T: number of frames;
+ D: feature dimension of each frame.
+
+ Args:
+ input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
+ length (torch.Tensor): number of valid frames in input
+ sequence, with shape () or (1,).
+ beam_width (int): beam size to use during search.
+
+ Returns:
+ List[Hypothesis]: top-``beam_width`` hypotheses found by beam search.
+ """
+ if input.dim() != 2 and not (input.dim() == 3 and input.shape[0] == 1):
+ raise ValueError("input must be of shape (T, D) or (1, T, D)")
+ if input.dim() == 2:
+ input = input.unsqueeze(0)
+
+ if length.shape != () and length.shape != (1,):
+ raise ValueError("length must be of shape () or (1,)")
+        if length.dim() == 0:
+            length = length.unsqueeze(0)
+
+ enc_out, _ = self.model.transcribe(input, length)
+ return self._search(enc_out, None, beam_width)
+
+ @torch.jit.export
+ def infer(
+ self,
+ input: torch.Tensor,
+ length: torch.Tensor,
+ beam_width: int,
+ state: Optional[List[List[torch.Tensor]]] = None,
+ hypothesis: Optional[Hypothesis] = None,
+ ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]:
+ r"""Performs beam search for the given input sequence in streaming mode.
+
+ T: number of frames;
+ D: feature dimension of each frame.
+
+ Args:
+ input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
+ length (torch.Tensor): number of valid frames in input
+ sequence, with shape () or (1,).
+ beam_width (int): beam size to use during search.
+ state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
+ representing transcription network internal state generated in preceding
+ invocation. (Default: ``None``)
+ hypothesis (Hypothesis or None): hypothesis from preceding invocation to seed
+ search with. (Default: ``None``)
+
+ Returns:
+ (List[Hypothesis], List[List[torch.Tensor]]):
+ List[Hypothesis]
+ top-``beam_width`` hypotheses found by beam search.
+ List[List[torch.Tensor]]
+ list of lists of tensors representing transcription network
+ internal state generated in current invocation.
+ """
+ if input.dim() != 2 and not (input.dim() == 3 and input.shape[0] == 1):
+ raise ValueError("input must be of shape (T, D) or (1, T, D)")
+ if input.dim() == 2:
+ input = input.unsqueeze(0)
+
+ if length.shape != () and length.shape != (1,):
+ raise ValueError("length must be of shape () or (1,)")
+ if length.dim() == 0:
+ length = length.unsqueeze(0)
+
+ enc_out, _, state = self.model.transcribe_streaming(input, length, state)
+ return self._search(enc_out, hypothesis, beam_width), state
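+
+
+# Usage sketch (assumptions flagged, not part of the module): decoding one utterance with
+# TCPGen biasing.  ``model`` is an RNNTBiasing instance (e.g. from conformer_rnnt_biasing_base),
+# ``trie`` is a prefix tree built elsewhere from the biasing word list, and the blank index is
+# assumed to be the last output symbol; all three are placeholders.
+#
+#   decoder = RNNTBeamSearchBiasing(model, blank=600, trie=trie, biasing=True)
+#   features = torch.rand(120, 80)                       # (T, D) acoustic features
+#   hypos = decoder(features, torch.tensor([120]), 10)   # beam_width = 10
+#   best_tokens = _get_hypo_tokens(hypos[0])             # token ids of the best hypothesis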
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..83da7aa43c6e387adb0cf2281cb2da70409145e4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__init__.py
@@ -0,0 +1,12 @@
+from ._vggish import VGGISH, VGGishBundle
+from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle
+from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
+
+__all__ = [
+ "EMFORMER_RNNT_BASE_MUSTC",
+ "EMFORMER_RNNT_BASE_TEDLIUM3",
+ "HIFIGAN_VOCODER_V3_LJSPEECH",
+ "HiFiGANVocoderBundle",
+ "VGGISH",
+ "VGGishBundle",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71178f893b52665d8f4602358696757fb7a3f519
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/hifigan_pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/hifigan_pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae9ffe5b47e8d58187cf4cabf629382763a893f3
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/hifigan_pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/rnnt_pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/rnnt_pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f32849e25266306c46595a478f0d891016dfeabd
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/__pycache__/rnnt_pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abec68e4d4d45bcd6a74820413bf5dc6b56869f4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__init__.py
@@ -0,0 +1,3 @@
+from ._vggish_pipeline import VGGISH, VGGishBundle
+
+__all__ = ["VGGISH", "VGGishBundle"]
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b909674c24c1af73ac2bcccf77f510d314811627
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/_vggish_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/_vggish_impl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41aea41d2640616348d4c38fc0e689123d6c219a
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/_vggish_impl.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/_vggish_pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/_vggish_pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..881819e93c4da2e1663d6390cc5033200e086efe
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/__pycache__/_vggish_pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/_vggish_impl.py b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/_vggish_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32613720cf7f78a81d9d185a75c2e873975e6cb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/_vggish_impl.py
@@ -0,0 +1,233 @@
+# Derived from torchvggish (https://github.com/harritaylor/torchvggish).
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import math
+
+import torch
+
+
+_MEL_BREAK_FREQUENCY_HERTZ = 700.0
+_MEL_HIGH_FREQUENCY_Q = 1127.0
+
+
+_SAMPLE_RATE = 16000
+_STFT_WINDOW_LENGTH_SECONDS = 0.025
+_STFT_HOP_LENGTH_SECONDS = 0.010
+_MEL_MIN_HZ = 125
+_MEL_MAX_HZ = 7500
+_NUM_BANDS = 64
+_LOG_OFFSET = 0.01
+_EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames
+_EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap.
+
+
+def _build_features_network():
+ layers = []
+
+ for input_dim, output_dim in [(1, 64), (64, 128)]:
+ layers += [
+ torch.nn.Conv2d(input_dim, output_dim, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
+ torch.nn.ReLU(inplace=True),
+ torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
+ ]
+
+ for input_dim, output_dim in [(128, 256), (256, 512)]:
+ layers += [
+ torch.nn.Conv2d(input_dim, output_dim, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
+ torch.nn.ReLU(inplace=True),
+ torch.nn.Conv2d(
+ output_dim,
+ output_dim,
+ kernel_size=(3, 3),
+ stride=(1, 1),
+ padding=(1, 1),
+ ),
+ torch.nn.ReLU(inplace=True),
+ torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
+ ]
+
+ return torch.nn.Sequential(*layers)
+
+
+def _build_embedding_network():
+ return torch.nn.Sequential(
+ torch.nn.Linear(512 * 4 * 6, 4096),
+ torch.nn.ReLU(True),
+ torch.nn.Linear(4096, 4096),
+ torch.nn.ReLU(True),
+ torch.nn.Linear(4096, 128),
+ torch.nn.ReLU(True),
+ )
+
+
+def _frame(data, window_length, hop_length):
+ num_samples = data.shape[0]
+ num_frames = 1 + int(math.floor((num_samples - window_length) / hop_length))
+ shape = (num_frames, window_length) + data.shape[1:]
+ strides = (data.stride()[0] * hop_length,) + data.stride()
+ return torch.as_strided(data, shape, strides)
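+
+
+# Worked example (comments only): framing a 10-sample signal with a 4-sample window and a hop
+# of 2 yields 1 + (10 - 4) // 2 = 4 overlapping frames, each a strided view of the input.
+#
+#   _frame(torch.arange(10.0), window_length=4, hop_length=2)
+#   # tensor([[0., 1., 2., 3.],
+#   #         [2., 3., 4., 5.],
+#   #         [4., 5., 6., 7.],
+#   #         [6., 7., 8., 9.]])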
+
+
+def _stft_magnitude(signal, fft_length, hop_length=None, window_length=None):
+ frames = _frame(signal, window_length, hop_length)
+ window = torch.hann_window(window_length, periodic=True).to(signal.device)
+ windowed_frames = frames * window
+ return torch.abs(torch.fft.rfft(windowed_frames, int(fft_length)))
+
+
+def _hertz_to_mel(frequencies_hertz):
+ return _MEL_HIGH_FREQUENCY_Q * torch.log(1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
+
+
+def _spectrogram_to_mel_matrix(
+ num_mel_bins=20,
+ num_spectrogram_bins=129,
+ audio_sample_rate=8000,
+ lower_edge_hertz=125.0,
+ upper_edge_hertz=3800.0,
+):
+ nyquist_hertz = audio_sample_rate / 2.0
+ if lower_edge_hertz < 0.0:
+ raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
+ if lower_edge_hertz >= upper_edge_hertz:
+ raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % (lower_edge_hertz, upper_edge_hertz))
+
+ if upper_edge_hertz > nyquist_hertz:
+ raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % (upper_edge_hertz, nyquist_hertz))
+ spectrogram_bins_hertz = torch.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
+
+ spectrogram_bins_mel = _hertz_to_mel(spectrogram_bins_hertz)
+ # The i'th mel band (starting from i=1) has center frequency
+ # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
+ # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
+ # the band_edges_mel arrays.
+ band_edges_mel = torch.linspace(
+ _hertz_to_mel(torch.tensor(lower_edge_hertz)),
+ _hertz_to_mel(torch.tensor(upper_edge_hertz)),
+ num_mel_bins + 2,
+ )
+ # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
+ # of spectrogram values.
+ mel_weights_matrix = torch.empty((num_spectrogram_bins, num_mel_bins))
+ for i in range(num_mel_bins):
+ lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i : i + 3]
+ # Calculate lower and upper slopes for every spectrogram bin.
+ # Line segments are linear in the *mel* domain, not hertz.
+ lower_slope = (spectrogram_bins_mel - lower_edge_mel) / (center_mel - lower_edge_mel)
+ upper_slope = (upper_edge_mel - spectrogram_bins_mel) / (upper_edge_mel - center_mel)
+
+ # .. then intersect them with each other and zero.
+ mel_weights_matrix[:, i] = torch.maximum(torch.tensor(0.0), torch.minimum(lower_slope, upper_slope))
+
+ # HTK excludes the spectrogram DC bin; make sure it always gets a zero
+ # coefficient.
+ mel_weights_matrix[0, :] = 0.0
+ return mel_weights_matrix
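+
+
+# Shape sketch (comments only): with the settings used by ``_waveform_to_examples`` below
+# (512-point FFT giving 257 spectrogram bins, 64 mel bands between 125 Hz and 7500 Hz at 16 kHz),
+# the returned matrix maps 257 spectrogram bins onto 64 mel bands via a matmul on the right.
+#
+#   m = _spectrogram_to_mel_matrix(num_mel_bins=64, num_spectrogram_bins=257,
+#                                  audio_sample_rate=16000,
+#                                  lower_edge_hertz=125.0, upper_edge_hertz=7500.0)
+#   m.shape   # torch.Size([257, 64])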
+
+
+def _log_mel_spectrogram(
+ data,
+ audio_sample_rate=8000,
+ log_offset=0.0,
+ window_length_secs=0.025,
+ hop_length_secs=0.010,
+ **kwargs,
+):
+ window_length_samples = int(round(audio_sample_rate * window_length_secs))
+ hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
+ fft_length = 2 ** int(math.ceil(math.log(window_length_samples) / math.log(2.0)))
+
+ spectrogram = _stft_magnitude(
+ data,
+ fft_length=fft_length,
+ hop_length=hop_length_samples,
+ window_length=window_length_samples,
+ )
+ mel_spectrogram = torch.matmul(
+ spectrogram,
+ _spectrogram_to_mel_matrix(
+ num_spectrogram_bins=spectrogram.shape[1],
+ audio_sample_rate=audio_sample_rate,
+ **kwargs,
+ ).to(spectrogram),
+ )
+ return torch.log(mel_spectrogram + log_offset)
+
+
+def _waveform_to_examples(data):
+ # Compute log mel spectrogram features, with shape (n_frame, n_mel)
+ log_mel = _log_mel_spectrogram(
+ data,
+ audio_sample_rate=_SAMPLE_RATE,
+ log_offset=_LOG_OFFSET,
+ window_length_secs=_STFT_WINDOW_LENGTH_SECONDS,
+ hop_length_secs=_STFT_HOP_LENGTH_SECONDS,
+ num_mel_bins=_NUM_BANDS,
+ lower_edge_hertz=_MEL_MIN_HZ,
+ upper_edge_hertz=_MEL_MAX_HZ,
+ )
+
+ # Frame features into examples, with shape (n_example, n_frame, n_mel)
+ features_sample_rate = 1.0 / _STFT_HOP_LENGTH_SECONDS
+ example_window_length = int(round(_EXAMPLE_WINDOW_SECONDS * features_sample_rate))
+
+ example_hop_length = int(round(_EXAMPLE_HOP_SECONDS * features_sample_rate))
+ log_mel_examples = _frame(log_mel, window_length=example_window_length, hop_length=example_hop_length)
+
+ # (n_example, 1, n_frame, n_mel)
+ return log_mel_examples.unsqueeze(1)
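+
+
+# Worked example (comments only): a 2-second, 16 kHz waveform (32000 samples) yields
+# 1 + (32000 - 400) // 160 = 198 log-mel frames of 64 bands, which are then grouped into
+# non-overlapping 0.96-second examples of 96 frames each:
+#
+#   _waveform_to_examples(torch.rand(32000)).shape   # torch.Size([2, 1, 96, 64])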
+
+
+class VGGish(torch.nn.Module):
+ """Implementation of VGGish model :cite:`45611`."""
+
+ def __init__(self):
+ super().__init__()
+
+ self.features_network = _build_features_network()
+ self.embedding_network = _build_embedding_network()
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ input (torch.Tensor): batch of spectrograms, with shape `(n_example, 1, n_frame, 64)`.
+
+ Returns:
+ torch.Tensor: model output, with shape `(n_example, 128)`.
+ """
+ x = self.features_network(input)
+
+ x = x.permute(0, 2, 3, 1)
+ x = x.reshape(x.size(0), -1)
+
+ return self.embedding_network(x)
+
+
+class VGGishInputProcessor:
+ """Converts raw waveforms to batches of examples to use as inputs to VGGish."""
+
+ def __call__(self, input: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+            input (torch.Tensor): waveform, with shape `(T,)`, expected to be sampled at 16 kHz.
+
+ Returns:
+ torch.Tensor: batch of examples to pass to VGGish, with shape `(n_example, 1, n_frame, 64)`.
+ """
+ if len(input.shape) != 1:
+ raise ValueError("input waveform must have dimension of 1.")
+ return _waveform_to_examples(input)
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..f67fe8ca169dcf19389391cac877056f606a6f8a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
@@ -0,0 +1,82 @@
+from dataclasses import dataclass
+from typing import Callable, Dict
+
+import torch
+import torchaudio
+
+from ._vggish_impl import _SAMPLE_RATE, VGGish as _VGGish, VGGishInputProcessor as _VGGishInputProcessor
+
+
+def _get_state_dict():
+ path = torchaudio.utils.download_asset("models/vggish.pt")
+ return torch.load(path)
+
+
+@dataclass
+class VGGishBundle:
+ """VGGish :cite:`45611` inference pipeline ported from
+    `torchvggish <https://github.com/harritaylor/torchvggish>`__
+    and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
+
+ Example:
+ >>> import torchaudio
+ >>> from torchaudio.prototype.pipelines import VGGISH
+ >>>
+ >>> input_sr = VGGISH.sample_rate
+ >>> input_proc = VGGISH.get_input_processor()
+ >>> model = VGGISH.get_model()
+ >>>
+ >>> waveform, sr = torchaudio.load(
+ >>> "Chopin_Ballade_-1_In_G_Minor,_Op._23.mp3",
+ >>> )
+ >>> waveform = waveform.squeeze(0)
+ >>> waveform = torchaudio.functional.resample(waveform, sr, input_sr)
+ >>> mono_output = model(input_proc(waveform))
+ """
+
+ class VGGish(_VGGish):
+ __doc__ = _VGGish.__doc__
+
+ class VGGishInputProcessor(_VGGishInputProcessor):
+ __doc__ = _VGGishInputProcessor.__doc__
+
+ _state_dict_func: Callable[[], Dict]
+
+ @property
+ def sample_rate(self) -> int:
+ """Sample rate of input waveform expected by input processor and model.
+
+ :type: int
+ """
+ return _SAMPLE_RATE
+
+ def get_model(self) -> VGGish:
+ """Constructs pre-trained VGGish model. Downloads and caches weights as necessary.
+
+ Returns:
+ VGGish: VGGish model with pre-trained weights loaded.
+ """
+ model = self.VGGish()
+ state_dict = self._state_dict_func()
+ model.load_state_dict(state_dict)
+ model.eval()
+ return model
+
+ def get_input_processor(self) -> VGGishInputProcessor:
+ """Constructs input processor for VGGish.
+
+ Returns:
+ VGGishInputProcessor: input processor for VGGish.
+ """
+ return self.VGGishInputProcessor()
+
+
+VGGISH = VGGishBundle(_get_state_dict)
+VGGISH.__doc__ = """Pre-trained VGGish :cite:`45611` inference pipeline ported from
+    `torchvggish <https://github.com/harritaylor/torchvggish>`__
+    and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
+
+    Per the `documentation <https://github.com/tensorflow/models/tree/master/research/audioset/vggish>`__
+ for the original model, the model is "trained on a large YouTube dataset (a preliminary version of
+ what later became YouTube-8M)".
+ """
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/hifigan_pipeline.py b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/hifigan_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c5a14e0731302de5bb716c902dcc9325aa42271
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/hifigan_pipeline.py
@@ -0,0 +1,228 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch.nn import Module
+from torchaudio._internal import load_state_dict_from_url
+
+from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
+from torchaudio.transforms import MelSpectrogram
+
+
+@dataclass
+class HiFiGANVocoderBundle:
+ """Data class that bundles associated information to use pretrained
+ :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
+
+ This class provides interfaces for instantiating the pretrained model along with
+ the information necessary to retrieve pretrained weights and additional data
+ to be used with the model.
+
+ Torchaudio library instantiates objects of this class, each of which represents
+ a different pretrained model. Client code should access pretrained models via these
+ instances.
+
+    This bundle can convert mel spectrograms to waveforms and vice versa. A typical use case would be a flow like
+ `text -> mel spectrogram -> waveform`, where one can use an external component, e.g. Tacotron2,
+ to generate mel spectrogram from text. Please see below for the code example.
+
+ Example: Transform synthetic mel spectrogram to audio.
+ >>> import torch
+ >>> import torchaudio
+ >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
+ >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
+ >>>
+ >>> # Load the HiFiGAN bundle
+ >>> vocoder = bundle.get_vocoder()
+ Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
+ 100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
+ >>>
+ >>> # Generate synthetic mel spectrogram
+ >>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
+ >>>
+ >>> # Transform mel spectrogram into audio
+ >>> waveform = vocoder(specgram)
+ >>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)
+
+ Example: Usage together with Tacotron2, text to audio.
+ >>> import torch
+ >>> import torchaudio
+ >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
+ >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
+ >>>
+ >>> # Load Tacotron2 bundle
+        >>> bundle_tacotron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
+        >>> processor = bundle_tacotron2.get_text_processor()
+        >>> tacotron2 = bundle_tacotron2.get_tacotron2()
+ >>>
+ >>> # Use Tacotron2 to convert text to mel spectrogram
+ >>> text = "A quick brown fox jumped over a lazy dog"
+ >>> input, lengths = processor(text)
+ >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
+ >>>
+ >>> # Load HiFiGAN bundle
+ >>> vocoder = bundle_hifigan.get_vocoder()
+ Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
+ 100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
+ >>>
+ >>> # Use HiFiGAN to convert mel spectrogram to audio
+ >>> waveform = vocoder(specgram).squeeze(0)
+ >>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
+ """ # noqa: E501
+
+ _path: str
+ _vocoder_params: Dict[str, Any] # Vocoder parameters
+ _mel_params: Dict[str, Any] # Mel transformation parameters
+ _sample_rate: float
+
+ def _get_state_dict(self, dl_kwargs):
+ url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
+ dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+ state_dict = load_state_dict_from_url(url, **dl_kwargs)
+ return state_dict
+
+ def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
+ """Construct the HiFiGAN Generator model, which can be used a vocoder, and load the pretrained weight.
+
+ The weight file is downloaded from the internet and cached with
+ :func:`torch.hub.load_state_dict_from_url`
+
+ Args:
+ dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+ Returns:
+ Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
+ """
+ model = hifigan_vocoder(**self._vocoder_params)
+ model.load_state_dict(self._get_state_dict(dl_kwargs))
+ model.eval()
+ return model
+
+ def get_mel_transform(self) -> Module:
+ """Construct an object which transforms waveforms into mel spectrograms."""
+ return _HiFiGANMelSpectrogram(
+ n_mels=self._vocoder_params["in_channels"],
+ sample_rate=self._sample_rate,
+ **self._mel_params,
+ )
+
+ @property
+ def sample_rate(self):
+ """Sample rate of the audio that the model is trained on.
+
+ :type: float
+ """
+ return self._sample_rate
+
+
+class _HiFiGANMelSpectrogram(torch.nn.Module):
+ """
+ Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
+ https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72
+
+    This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achieve
+ equivalence with the HiFiGAN implementation.
+
+ Args:
+ hop_size (int): Length of hop between STFT windows.
+ n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
+ win_length (int): Window size.
+ f_min (float or None): Minimum frequency.
+ f_max (float or None): Maximum frequency.
+ sample_rate (int): Sample rate of audio signal.
+ n_mels (int): Number of mel filterbanks.
+ """
+
+ def __init__(
+ self,
+ hop_size: int,
+ n_fft: int,
+ win_length: int,
+ f_min: Optional[float],
+ f_max: Optional[float],
+ sample_rate: float,
+ n_mels: int,
+ ):
+ super(_HiFiGANMelSpectrogram, self).__init__()
+ self.mel_transform = MelSpectrogram(
+ sample_rate=sample_rate,
+ n_fft=n_fft,
+ win_length=win_length,
+ hop_length=hop_size,
+ f_min=f_min,
+ f_max=f_max,
+ n_mels=n_mels,
+ normalized=False,
+ pad=0,
+ mel_scale="slaney",
+ norm="slaney",
+ center=False,
+ )
+ self.sample_rate = sample_rate
+ self.hop_size = hop_size
+ self.n_fft = n_fft
+ self.win_length = win_length
+ self.f_min = f_min
+ self.f_max = f_max
+ self.n_mels = n_mels
+ self.pad_size = int((n_fft - hop_size) / 2)
+
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+ """Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.
+
+ Args:
+ waveform (Tensor): waveform of shape ``(batch_size, time_length)``.
+ Returns:
+ Tensor of shape ``(batch_size, n_mel, time_length)``
+ """
+ ref_waveform = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect")
+ ref_waveform = ref_waveform.squeeze(1)
+
+ spectr = (self.mel_transform.spectrogram(ref_waveform) + 1e-9) ** 0.5
+ mel_spectrogram = self.mel_transform.mel_scale(spectr)
+ mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-5))
+ return mel_spectrogram
+
+
+HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
+ "hifigan_vocoder_v3_ljspeech.pth",
+ _vocoder_params={
+ "upsample_rates": (8, 8, 4),
+ "upsample_kernel_sizes": (16, 16, 8),
+ "upsample_initial_channel": 256,
+ "resblock_kernel_sizes": (3, 5, 7),
+ "resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
+ "resblock_type": 2,
+ "in_channels": 80,
+ "lrelu_slope": 0.1,
+ },
+ _mel_params={
+ "hop_size": 256,
+ "n_fft": 1024,
+ "win_length": 1024,
+ "f_min": 0,
+ "f_max": 8000,
+ },
+ _sample_rate=22050,
+)
+HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
+ :cite:`ljspeech17`.
+
+    This pipeline can be used with an external component which generates mel spectrograms from text, for example,
+ Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
+ Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
+ using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
+ HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
+ :py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
+ :py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
+ `_.
+
+ The underlying vocoder is constructed by
+ :py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
+ with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
+    <https://github.com/jik876/hifi-gan/blob/master/LICENSE>`__. See links to
+    pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.
+
+ Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
+ """
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/rnnt_pipeline.py b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/rnnt_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..20783ecdab5980252ac0f9490877b2de2e4f53a9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/pipelines/rnnt_pipeline.py
@@ -0,0 +1,58 @@
+from functools import partial
+
+from torchaudio.models import emformer_rnnt_base
+from torchaudio.pipelines import RNNTBundle
+
+
+EMFORMER_RNNT_BASE_MUSTC = RNNTBundle(
+ _rnnt_path="models/emformer_rnnt_base_mustc.pt",
+ _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
+ _global_stats_path="pipeline-assets/global_stats_rnnt_mustc.json",
+ _sp_model_path="pipeline-assets/spm_bpe_500_mustc.model",
+ _right_padding=4,
+ _blank=500,
+ _sample_rate=16000,
+ _n_fft=400,
+ _n_mels=80,
+ _hop_length=160,
+ _segment_length=16,
+ _right_context_length=4,
+)
+EMFORMER_RNNT_BASE_MUSTC.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
+streaming and non-streaming inference.
+
+The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
+and utilizes weights trained on *MuST-C release v2.0* :cite:`CATTONI2021101155` dataset
+using training script ``train.py``
+`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
+with ``num_symbols=501``.
+
+Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
+"""
+
+
+EMFORMER_RNNT_BASE_TEDLIUM3 = RNNTBundle(
+ _rnnt_path="models/emformer_rnnt_base_tedlium3.pt",
+ _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
+ _global_stats_path="pipeline-assets/global_stats_rnnt_tedlium3.json",
+ _sp_model_path="pipeline-assets/spm_bpe_500_tedlium3.model",
+ _right_padding=4,
+ _blank=500,
+ _sample_rate=16000,
+ _n_fft=400,
+ _n_mels=80,
+ _hop_length=160,
+ _segment_length=16,
+ _right_context_length=4,
+)
+EMFORMER_RNNT_BASE_TEDLIUM3.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
+streaming and non-streaming inference.
+
+The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
+and utilizes weights trained on *TED-LIUM Release 3* :cite:`rousseau2012tedlium` dataset
+using training script ``train.py``
+`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
+with ``num_symbols=501``.
+
+Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
+"""
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__init__.py b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6242f3a4e7c0dec9a255ba97069de7ef52ddc957
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__init__.py
@@ -0,0 +1,9 @@
+from ._transforms import BarkScale, BarkSpectrogram, ChromaScale, ChromaSpectrogram, InverseBarkScale
+
+__all__ = [
+ "BarkScale",
+ "BarkSpectrogram",
+ "ChromaScale",
+ "ChromaSpectrogram",
+ "InverseBarkScale",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aadf37450e4425b8b552c931f32fd65abe57b853
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__pycache__/_transforms.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__pycache__/_transforms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e9acdbf37e41039fe1e61f34b5e026eacd736af
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/__pycache__/_transforms.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/prototype/transforms/_transforms.py b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0fa10824eb759f8b7e925455bdbfe2184ec7beb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/prototype/transforms/_transforms.py
@@ -0,0 +1,456 @@
+from typing import Callable, Optional
+
+import torch
+from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
+from torchaudio.transforms import Spectrogram
+
+
+class BarkScale(torch.nn.Module):
+ r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+ n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
+ norm (str or None, optional): If ``"slaney"``, divide the triangular bark weights by the width of the bark band
+ (area normalization). (Default: ``None``)
+ bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
+ >>> spectrogram = spectrogram_transform(waveform)
+ >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
+ >>> barkscale_spectrogram = barkscale_transform(spectrogram)
+
+ See also:
+ :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
+ generate the filter banks.
+ """
+ __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]
+
+ def __init__(
+ self,
+ n_barks: int = 128,
+ sample_rate: int = 16000,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ n_stft: int = 201,
+ bark_scale: str = "traunmuller",
+ ) -> None:
+ super(BarkScale, self).__init__()
+ self.n_barks = n_barks
+ self.sample_rate = sample_rate
+ self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+ self.f_min = f_min
+ self.bark_scale = bark_scale
+
+ if f_min > self.f_max:
+ raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
+
+ fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
+ self.register_buffer("fb", fb)
+
+ def forward(self, specgram: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).
+
+ Returns:
+ torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
+ """
+
+        # (..., time, freq) dot (freq, n_barks) -> (..., n_barks, time)
+ bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
+
+ return bark_specgram
+
+
+class InverseBarkScale(torch.nn.Module):
+ r"""Estimate a STFT in normal frequency domain from bark frequency domain.
+
+ .. devices:: CPU CUDA
+
+    It minimizes the euclidean norm between the input bark-spectrogram and the product between
+ the estimated spectrogram and the filter banks using SGD.
+
+ Args:
+ n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
+ n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+ max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
+ tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
+ tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
+ sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
+ bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
+        >>> bark_spectrogram = bark_spectrogram_transform(waveform)
+        >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
+        >>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
+ """
+ __constants__ = [
+ "n_stft",
+ "n_barks",
+ "sample_rate",
+ "f_min",
+ "f_max",
+ "max_iter",
+ "tolerance_loss",
+ "tolerance_change",
+ "sgdargs",
+ ]
+
+ def __init__(
+ self,
+ n_stft: int,
+ n_barks: int = 128,
+ sample_rate: int = 16000,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ max_iter: int = 100000,
+ tolerance_loss: float = 1e-5,
+ tolerance_change: float = 1e-8,
+ sgdargs: Optional[dict] = None,
+ bark_scale: str = "traunmuller",
+ ) -> None:
+ super(InverseBarkScale, self).__init__()
+ self.n_barks = n_barks
+ self.sample_rate = sample_rate
+ self.f_max = f_max or float(sample_rate // 2)
+ self.f_min = f_min
+ self.max_iter = max_iter
+ self.tolerance_loss = tolerance_loss
+ self.tolerance_change = tolerance_change
+ self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
+
+ if f_min > self.f_max:
+ raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
+
+ fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
+ self.register_buffer("fb", fb)
+
+ def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)
+
+ Returns:
+ torch.Tensor: Linear scale spectrogram of size (..., freq, time)
+ """
+ # pack batch
+ shape = barkspec.size()
+ barkspec = barkspec.view(-1, shape[-2], shape[-1])
+
+ n_barks, time = shape[-2], shape[-1]
+        freq, _ = self.fb.size() # (freq, n_barks)
+ barkspec = barkspec.transpose(-1, -2)
+ if self.n_barks != n_barks:
+ raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
+
+ specgram = torch.rand(
+ barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
+ )
+
+ optim = torch.optim.SGD([specgram], **self.sgdargs)
+
+ loss = float("inf")
+ for _ in range(self.max_iter):
+ optim.zero_grad()
+ diff = barkspec - specgram.matmul(self.fb)
+ new_loss = diff.pow(2).sum(axis=-1).mean()
+ # take sum over bark-frequency then average over other dimensions
+            # so that the loss threshold is applied per unit time frame
+ new_loss.backward()
+ optim.step()
+ specgram.data = specgram.data.clamp(min=0)
+
+ new_loss = new_loss.item()
+ if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
+ break
+ loss = new_loss
+
+ specgram.requires_grad_(False)
+ specgram = specgram.clamp(min=0).transpose(-1, -2)
+
+ # unpack batch
+ specgram = specgram.view(shape[:-2] + (freq, time))
+ return specgram
+
+
+class BarkSpectrogram(torch.nn.Module):
+ r"""Create BarkSpectrogram for a raw audio signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
+    and :py:func:`torchaudio.prototype.transforms.BarkScale`.
+
+ Sources
+ * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
+    * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
+      Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
+ * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html
+
+ Args:
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``None``)
+ pad (int, optional): Two sided padding of signal. (Default: ``0``)
+        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
+ window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ power (float, optional): Exponent for the magnitude spectrogram,
+ (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
+ normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
+ wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
+ center (bool, optional): whether to pad :attr:`waveform` on both sides so
+ that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+ (Default: ``True``)
+ pad_mode (string, optional): controls the padding method used when
+ :attr:`center` is ``True``. (Default: ``"reflect"``)
+ bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.BarkSpectrogram(sample_rate)
+ >>> bark_specgram = transform(waveform) # (channel, n_barks, time)
+
+ See also:
+        :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
+ generate the filter banks.
+ """
+ __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]
+
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ n_fft: int = 400,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ pad: int = 0,
+ n_barks: int = 128,
+ window_fn: Callable[..., torch.Tensor] = torch.hann_window,
+ power: float = 2.0,
+ normalized: bool = False,
+ wkwargs: Optional[dict] = None,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ bark_scale: str = "traunmuller",
+ ) -> None:
+ super(BarkSpectrogram, self).__init__()
+
+ self.sample_rate = sample_rate
+ self.n_fft = n_fft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ self.pad = pad
+ self.power = power
+ self.normalized = normalized
+ self.n_barks = n_barks # number of bark frequency bins
+ self.f_max = f_max
+ self.f_min = f_min
+ self.spectrogram = Spectrogram(
+ n_fft=self.n_fft,
+ win_length=self.win_length,
+ hop_length=self.hop_length,
+ pad=self.pad,
+ window_fn=window_fn,
+ power=self.power,
+ normalized=self.normalized,
+ wkwargs=wkwargs,
+ center=center,
+ pad_mode=pad_mode,
+ onesided=True,
+ )
+ self.bark_scale = BarkScale(
+ self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
+ )
+
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).
+
+ Returns:
+ torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
+ """
+ specgram = self.spectrogram(waveform)
+ bark_specgram = self.bark_scale(specgram)
+ return bark_specgram
+
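+ # Editor's sketch (not part of the upstream module): minimal BarkSpectrogram usage and the
+ # equivalent manual composition of Spectrogram and BarkScale. The parameter values
+ # (16 kHz input, n_barks=32) are illustrative assumptions.
+ #
+ #     >>> waveform = torch.randn(1, 16000)                        # hypothetical 1-second signal
+ #     >>> bark = BarkSpectrogram(sample_rate=16000, n_fft=400, n_barks=32)(waveform)
+ #     >>> # Equivalent composition, mirroring the internals above:
+ #     >>> spec = Spectrogram(n_fft=400)(waveform)                  # power spectrogram
+ #     >>> bark2 = BarkScale(32, 16000, 0.0, None, 400 // 2 + 1, "traunmuller")(spec)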
+
+class ChromaScale(torch.nn.Module):
+ r"""Converts spectrogram to chromagram.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd
+
+ Args:
+ sample_rate (int): Sample rate of audio signal.
+ n_freqs (int): Number of frequency bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
+ n_chroma (int, optional): Number of chroma. (Default: ``12``)
+ tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
+ ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
+ octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
+ If ``None``, then disable weighting altogether. (Default: 2.0)
+ norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
+ base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
+ >>> spectrogram = spectrogram_transform(waveform)
+ >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1)
+ >>> chroma_spectrogram = chroma_transform(spectrogram)
+
+ See also:
+ :py:func:`torchaudio.prototype.functional.chroma_filterbank` — function used to
+ generate the filter bank.
+ """
+
+ def __init__(
+ self,
+ sample_rate: int,
+ n_freqs: int,
+ *,
+ n_chroma: int = 12,
+ tuning: float = 0.0,
+ ctroct: float = 5.0,
+ octwidth: Optional[float] = 2.0,
+ norm: int = 2,
+ base_c: bool = True,
+ ):
+ super().__init__()
+ fb = chroma_filterbank(
+ sample_rate, n_freqs, n_chroma, tuning=tuning, ctroct=ctroct, octwidth=octwidth, norm=norm, base_c=base_c
+ )
+ self.register_buffer("fb", fb)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ x (torch.Tensor): Spectrogram of dimension (..., ``n_freqs``, time).
+
+ Returns:
+ torch.Tensor: Chroma spectrogram of size (..., ``n_chroma``, time).
+ """
+ return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)
+
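+ # Editor's sketch (assumption, shapes inferred from the matmul in ``forward`` above): the
+ # filter bank ``fb`` has shape ``(n_freqs, n_chroma)``, so a spectrogram of shape
+ # ``(..., n_freqs, time)`` maps to a chromagram of shape ``(..., n_chroma, time)``.
+ #
+ #     >>> spec = Spectrogram(n_fft=1024)(torch.randn(1, 16000))    # (1, 513, time)
+ #     >>> chroma = ChromaScale(sample_rate=16000, n_freqs=1024 // 2 + 1)(spec)
+ #     >>> chroma.shape                                             # torch.Size([1, 12, time])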
+
+class ChromaSpectrogram(torch.nn.Module):
+ r"""Generates chromagram for audio signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd
+
+ Composes :py:func:`torchaudio.transforms.Spectrogram`
+ and :py:func:`torchaudio.prototype.transforms.ChromaScale`.
+
+ Args:
+ sample_rate (int): Sample rate of audio signal.
+ n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ pad (int, optional): Two sided padding of signal. (Default: ``0``)
+ window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ power (float, optional): Exponent for the magnitude spectrogram,
+ (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
+ normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
+ wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
+ center (bool, optional): whether to pad :attr:`waveform` on both sides so
+ that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+ (Default: ``True``)
+ pad_mode (string, optional): controls the padding method used when
+ :attr:`center` is ``True``. (Default: ``"reflect"``)
+ n_chroma (int, optional): Number of chroma. (Default: ``12``)
+ tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
+ ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
+ octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
+ If ``None``, then disable weighting altogether. (Default: 2.0)
+ norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
+ base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400)
+ >>> chromagram = transform(waveform) # (channel, n_chroma, time)
+ """
+
+ def __init__(
+ self,
+ sample_rate: int,
+ n_fft: int,
+ *,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ pad: int = 0,
+ window_fn: Callable[..., torch.Tensor] = torch.hann_window,
+ power: float = 2.0,
+ normalized: bool = False,
+ wkwargs: Optional[dict] = None,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ n_chroma: int = 12,
+ tuning: float = 0.0,
+ ctroct: float = 5.0,
+ octwidth: Optional[float] = 2.0,
+ norm: int = 2,
+ base_c: bool = True,
+ ):
+ super().__init__()
+ self.spectrogram = Spectrogram(
+ n_fft=n_fft,
+ win_length=win_length,
+ hop_length=hop_length,
+ pad=pad,
+ window_fn=window_fn,
+ power=power,
+ normalized=normalized,
+ wkwargs=wkwargs,
+ center=center,
+ pad_mode=pad_mode,
+ onesided=True,
+ )
+ self.chroma_scale = ChromaScale(
+ sample_rate,
+ n_fft // 2 + 1,
+ n_chroma=n_chroma,
+ tuning=tuning,
+ base_c=base_c,
+ ctroct=ctroct,
+ octwidth=octwidth,
+ norm=norm,
+ )
+
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension (..., time).
+
+ Returns:
+ Tensor: Chromagram of size (..., ``n_chroma``, time).
+ """
+ spectrogram = self.spectrogram(waveform)
+ chroma_spectrogram = self.chroma_scale(spectrogram)
+ return chroma_spectrogram
diff --git a/MLPY/Lib/site-packages/torchaudio/sox_effects/__init__.py b/MLPY/Lib/site-packages/torchaudio/sox_effects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c65a49b277c25c36273b888c1ac2861cb3ce9a0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/sox_effects/__init__.py
@@ -0,0 +1,10 @@
+from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects
+
+
+__all__ = [
+ "init_sox_effects",
+ "shutdown_sox_effects",
+ "effect_names",
+ "apply_effects_tensor",
+ "apply_effects_file",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/sox_effects/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/sox_effects/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e67975cd44829d7c43e4f8b635c2f2df1234da03
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/sox_effects/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/sox_effects/__pycache__/sox_effects.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/sox_effects/__pycache__/sox_effects.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a101c55d477fbb66b3dfbbc6f88ee382c3cd9a95
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/sox_effects/__pycache__/sox_effects.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/sox_effects/sox_effects.py b/MLPY/Lib/site-packages/torchaudio/sox_effects/sox_effects.py
new file mode 100644
index 0000000000000000000000000000000000000000..b80e4bfad689925ba4a08503ebaeddcbe5cc6a5d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/sox_effects/sox_effects.py
@@ -0,0 +1,272 @@
+import os
+from typing import List, Optional, Tuple
+
+import torch
+import torchaudio
+from torchaudio._internal.module_utils import deprecated
+from torchaudio.utils.sox_utils import list_effects
+
+
+sox_ext = torchaudio._extension.lazy_import_sox_ext()
+
+
+@deprecated("Please remove the call. This function is called automatically.")
+def init_sox_effects():
+ """Initialize resources required to use sox effects.
+
+ Note:
+ You do not need to call this function manually. It is called automatically.
+
+ Once initialized, you do not need to call this function again across the multiple uses of
+ sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet.
+ Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
+ again will result in an error.
+ """
+ pass
+
+
+@deprecated("Please remove the call. This function is called automatically.")
+def shutdown_sox_effects():
+ """Clean up resources required to use sox effects.
+
+ Note:
+ You do not need to call this function manually. It is called automatically.
+
+ It is safe to call this function multiple times.
+ Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
+ initializing again will result in an error.
+ """
+ pass
+
+
+def effect_names() -> List[str]:
+ """Gets list of valid sox effect names
+
+ Returns:
+ List[str]: list of available effect names.
+
+ Example
+ >>> torchaudio.sox_effects.effect_names()
+ ['allpass', 'band', 'bandpass', ... ]
+ """
+ return list(list_effects().keys())
+
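+ # Editor's sketch (assumption): ``effect_names`` can be used to guard against unavailable
+ # effects before building an effect chain; the effect and value below are illustrative.
+ #
+ #     >>> if "pitch" in effect_names():
+ #     ...     effects = [["pitch", "100"]]    # +100 cents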
+
+def apply_effects_tensor(
+ tensor: torch.Tensor,
+ sample_rate: int,
+ effects: List[List[str]],
+ channels_first: bool = True,
+) -> Tuple[torch.Tensor, int]:
+ """Apply sox effects to given Tensor
+
+ .. devices:: CPU
+
+ .. properties:: TorchScript
+
+ Note:
+ This function only works on CPU Tensors.
+ This function works in a way very similar to the ``sox`` command, but there are slight
+ differences. For example, the ``sox`` command adds certain effects automatically (such as the
+ ``rate`` effect after ``speed``, ``pitch`` and other effects), whereas this function
+ only applies the given effects. (Therefore, to actually apply the ``speed`` effect, you also
+ need to give the ``rate`` effect with the desired sampling rate.)
+
+ Args:
+ tensor (torch.Tensor): Input 2D CPU Tensor.
+ sample_rate (int): Sample rate
+ effects (List[List[str]]): List of effects.
+ channels_first (bool, optional): Indicates if the input Tensor's dimension is
+ `[channels, time]` or `[time, channels]`
+
+ Returns:
+ (Tensor, int): Resulting Tensor and sample rate.
+ The resulting Tensor has the same ``dtype`` as the input Tensor, and
+ the same channels order. The shape of the Tensor can be different based on the
+ effects applied. Sample rate can also be different based on the effects applied.
+
+ Example - Basic usage
+ >>>
+ >>> # Defines the effects to apply
+ >>> effects = [
+ ... ['gain', '-n'], # normalises to 0dB
+ ... ['pitch', '5'], # 5 cent pitch shift
+ ... ['rate', '8000'], # resample to 8000 Hz
+ ... ]
+ >>>
+ >>> # Generate pseudo wave:
+ >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
+ >>> sample_rate = 16000
+ >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1
+ >>> waveform.shape
+ torch.Size([2, 16000])
+ >>> waveform
+ tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442],
+ [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]])
+ >>>
+ >>> # Apply effects
+ >>> waveform, sample_rate = apply_effects_tensor(
+ ... waveform, sample_rate, effects, channels_first=True)
+ >>>
+ >>> # Check the result
+ >>> # The new waveform is sampling rate 8000, 1 second.
+ >>> # normalization and channel order are preserved
+ >>> waveform.shape
+ torch.Size([2, 8000])
+ >>> waveform
+ tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110],
+ [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]])
+ >>> sample_rate
+ 8000
+
+ Example - Torchscript-able transform
+ >>>
+ >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file,
+ >>> # then run sox effect via Torchscript runtime.
+ >>>
+ >>> class SoxEffectTransform(torch.nn.Module):
+ ... effects: List[List[str]]
+ ...
+ ... def __init__(self, effects: List[List[str]]):
+ ... super().__init__()
+ ... self.effects = effects
+ ...
+ ... def forward(self, tensor: torch.Tensor, sample_rate: int):
+ ... return sox_effects.apply_effects_tensor(
+ ... tensor, sample_rate, self.effects)
+ ...
+ ...
+ >>> # Create transform object
+ >>> effects = [
+ ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter
+ ... ["rate", "8000"], # change sample rate to 8000
+ ... ]
+ >>> transform = SoxEffectTransform(effects)
+ >>>
+ >>> # Dump it to file and load
+ >>> path = 'sox_effect.zip'
+ >>> torch.jit.script(transform).save(path)
+ >>> transform = torch.jit.load(path)
+ >>>
+ >>> # Run transform
+ >>> waveform, input_sample_rate = torchaudio.load("input.wav")
+ >>> waveform, sample_rate = transform(waveform, input_sample_rate)
+ >>> assert sample_rate == 8000
+ """
+ return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first)
+
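+ # Editor's sketch (assumption): as the Note above says, ``speed`` must be paired with ``rate``
+ # to actually resample the signal; the effect values below are illustrative.
+ #
+ #     >>> waveform = 2 * torch.rand(2, 16000) - 1
+ #     >>> effects = [["speed", "1.1"], ["rate", "16000"]]
+ #     >>> out, sr = apply_effects_tensor(waveform, 16000, effects)
+ #     >>> sr
+ #     16000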
+
+def apply_effects_file(
+ path: str,
+ effects: List[List[str]],
+ normalize: bool = True,
+ channels_first: bool = True,
+ format: Optional[str] = None,
+) -> Tuple[torch.Tensor, int]:
+ """Apply sox effects to the audio file and load the resulting data as Tensor
+
+ .. devices:: CPU
+
+ .. properties:: TorchScript
+
+ Note:
+ This function works in a way very similar to the ``sox`` command, but there are slight
+ differences. For example, the ``sox`` command adds certain effects automatically (such as the
+ ``rate`` effect after ``speed``, ``pitch``, etc.), but this function only applies the given
+ effects. Therefore, to actually apply the ``speed`` effect, you also need to give the ``rate``
+ effect with the desired sampling rate, because internally the ``speed`` effect only alters the
+ sampling rate and leaves the samples untouched.
+
+ Args:
+ path (path-like object):
+ Source of audio data.
+ effects (List[List[str]]): List of effects.
+ normalize (bool, optional):
+ When ``True``, this function converts the native sample type to ``float32``.
+ Default: ``True``.
+
+ If the input file is an integer WAV file, giving ``False`` will change the resulting Tensor to
+ an integer type.
+ This argument has no effect for formats other than integer WAV.
+
+ channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
+ Otherwise, the returned Tensor's dimension is `[time, channel]`.
+ format (str or None, optional):
+ Override the format detection with the given format.
+ Providing the argument might help when libsox cannot infer the format
+ from the header or extension.
+
+ Returns:
+ (Tensor, int): Resulting Tensor and sample rate.
+ If ``normalize=True``, the resulting Tensor is always ``float32`` type.
+ If ``normalize=False`` and the input audio file is an integer WAV file, then the
+ resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
+ If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
+ otherwise `[time, channel]`.
+
+ Example - Basic usage
+ >>>
+ >>> # Defines the effects to apply
+ >>> effects = [
+ ... ['gain', '-n'], # normalises to 0dB
+ ... ['pitch', '5'], # 5 cent pitch shift
+ ... ['rate', '8000'], # resample to 8000 Hz
+ ... ]
+ >>>
+ >>> # Apply effects and load data with channels_first=True
+ >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
+ >>>
+ >>> # Check the result
+ >>> waveform.shape
+ torch.Size([2, 8000])
+ >>> waveform
+ tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07,
+ -1.4761e-07, 1.8114e-07],
+ [-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07,
+ -5.6159e-07, 4.8103e-07]])
+ >>> sample_rate
+ 8000
+
+ Example - Apply random speed perturbation to dataset
+ >>>
+ >>> # Load data from file, apply random speed perturbation
+ >>> class RandomPerturbationFile(torch.utils.data.Dataset):
+ ... \"\"\"Given flist, apply random speed perturbation
+ ...
+ ... Suppose all the input files are at least one second long.
+ ... \"\"\"
+ ... def __init__(self, flist: List[str], sample_rate: int):
+ ... super().__init__()
+ ... self.flist = flist
+ ... self.sample_rate = sample_rate
+ ...
+ ... def __getitem__(self, index):
+ ... speed = 0.5 + 1.5 * random.random()
+ ... effects = [
+ ... ['gain', '-n', '-10'], # apply 10 db attenuation
+ ... ['remix', '-'], # merge all the channels
+ ... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds.
+ ... ['rate', f'{self.sample_rate}'],
+ ... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end
+ ... ['trim', '0', '2'], # get the first 2 seconds
+ ... ]
+ ... waveform, _ = torchaudio.sox_effects.apply_effects_file(
+ ... self.flist[index], effects)
+ ... return waveform
+ ...
+ ... def __len__(self):
+ ... return len(self.flist)
+ ...
+ >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
+ >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32)
+ >>> for batch in loader:
+ >>> pass
+ """
+ if not torch.jit.is_scripting():
+ if hasattr(path, "read"):
+ raise RuntimeError(
+ "apply_effects_file function does not support file-like object. "
+ "Please use torchaudio.io.AudioEffector."
+ )
+ path = os.fspath(path)
+ return sox_ext.apply_effects_file(path, effects, normalize, channels_first, format)
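+ # Editor's sketch (assumption, hypothetical path): with ``normalize=False`` the dtype follows the
+ # file's sample type, e.g. ``torch.int16`` for a 16-bit PCM WAV file.
+ #
+ #     >>> waveform, sr = apply_effects_file("speech_int16.wav", [["rate", "8000"]], normalize=False)
+ #     >>> waveform.dtype
+ #     torch.int16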
diff --git a/MLPY/Lib/site-packages/torchaudio/transforms/__init__.py b/MLPY/Lib/site-packages/torchaudio/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd84516a001e93db4085229354b57a64b5213f3d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/transforms/__init__.py
@@ -0,0 +1,75 @@
+from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
+from ._transforms import (
+ AddNoise,
+ AmplitudeToDB,
+ ComputeDeltas,
+ Convolve,
+ Deemphasis,
+ Fade,
+ FFTConvolve,
+ FrequencyMasking,
+ GriffinLim,
+ InverseMelScale,
+ InverseSpectrogram,
+ LFCC,
+ Loudness,
+ MelScale,
+ MelSpectrogram,
+ MFCC,
+ MuLawDecoding,
+ MuLawEncoding,
+ PitchShift,
+ Preemphasis,
+ Resample,
+ RNNTLoss,
+ SlidingWindowCmn,
+ SpecAugment,
+ SpectralCentroid,
+ Spectrogram,
+ Speed,
+ SpeedPerturbation,
+ TimeMasking,
+ TimeStretch,
+ Vad,
+ Vol,
+)
+
+
+__all__ = [
+ "AddNoise",
+ "AmplitudeToDB",
+ "ComputeDeltas",
+ "Convolve",
+ "Deemphasis",
+ "Fade",
+ "FFTConvolve",
+ "FrequencyMasking",
+ "GriffinLim",
+ "InverseMelScale",
+ "InverseSpectrogram",
+ "LFCC",
+ "Loudness",
+ "MFCC",
+ "MVDR",
+ "MelScale",
+ "MelSpectrogram",
+ "MuLawDecoding",
+ "MuLawEncoding",
+ "PSD",
+ "PitchShift",
+ "Preemphasis",
+ "RNNTLoss",
+ "RTFMVDR",
+ "Resample",
+ "SlidingWindowCmn",
+ "SoudenMVDR",
+ "SpecAugment",
+ "SpectralCentroid",
+ "Spectrogram",
+ "Speed",
+ "SpeedPerturbation",
+ "TimeMasking",
+ "TimeStretch",
+ "Vad",
+ "Vol",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..062217144d5ae2152e7f678c8b54cbed79703450
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/_multi_channel.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/_multi_channel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..131a3761e904680ce1ea18f42169bf684b1734fa
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/_multi_channel.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/_transforms.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/_transforms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a88cc32a673c29eb50d7737fc14a524a4f14fd12
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/transforms/__pycache__/_transforms.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/transforms/_multi_channel.py b/MLPY/Lib/site-packages/torchaudio/transforms/_multi_channel.py
new file mode 100644
index 0000000000000000000000000000000000000000..956ccd2ee1526e56f647872a07a5f55957ce2381
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/transforms/_multi_channel.py
@@ -0,0 +1,467 @@
+# -*- coding: utf-8 -*-
+
+import warnings
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+from torchaudio import functional as F
+
+
+__all__ = []
+
+
+def _get_mvdr_vector(
+ psd_s: torch.Tensor,
+ psd_n: torch.Tensor,
+ reference_vector: torch.Tensor,
+ solution: str = "ref_channel",
+ diagonal_loading: bool = True,
+ diag_eps: float = 1e-7,
+ eps: float = 1e-8,
+) -> torch.Tensor:
+ r"""Compute the MVDR beamforming weights with ``solution`` argument.
+
+ Args:
+ psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ reference_vector (torch.Tensor): one-hot reference channel matrix.
+ solution (str, optional): Solution to compute the MVDR beamforming weights.
+ Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
+ diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
+ (Default: ``True``)
+ diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
+ It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
+ eps (float, optional): Value to add to the denominator in the beamforming weight formula.
+ (Default: ``1e-8``)
+
+ Returns:
+ torch.Tensor: the mvdr beamforming weight matrix
+ """
+ if solution == "ref_channel":
+ beamform_vector = F.mvdr_weights_souden(psd_s, psd_n, reference_vector, diagonal_loading, diag_eps, eps)
+ else:
+ if solution == "stv_evd":
+ stv = F.rtf_evd(psd_s)
+ else:
+ stv = F.rtf_power(psd_s, psd_n, reference_vector, diagonal_loading=diagonal_loading, diag_eps=diag_eps)
+ beamform_vector = F.mvdr_weights_rtf(stv, psd_n, reference_vector, diagonal_loading, diag_eps, eps)
+
+ return beamform_vector
+
+
+class PSD(torch.nn.Module):
+ r"""Compute cross-channel power spectral density (PSD) matrix.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``)
+ normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``)
+ eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``)
+ """
+
+ def __init__(self, multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15):
+ super().__init__()
+ self.multi_mask = multi_mask
+ self.normalize = normalize
+ self.eps = eps
+
+ def forward(self, specgram: torch.Tensor, mask: Optional[torch.Tensor] = None):
+ """
+ Args:
+ specgram (torch.Tensor): Multi-channel complex-valued spectrum.
+ Tensor with dimensions `(..., channel, freq, time)`.
+ mask (torch.Tensor or None, optional): Time-Frequency mask for normalization.
+ Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` or
+ with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
+ (Default: ``None``)
+
+ Returns:
+ torch.Tensor: The complex-valued PSD matrix of the input spectrum.
+ Tensor with dimensions `(..., freq, channel, channel)`
+ """
+ if mask is not None:
+ if self.multi_mask:
+ # Averaging mask along channel dimension
+ mask = mask.mean(dim=-3) # (..., freq, time)
+ psd = F.psd(specgram, mask, self.normalize, self.eps)
+
+ return psd
+
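+ # Editor's sketch (assumption): shapes follow the docstring above -- a multi-channel complex
+ # spectrum ``(batch, channel, freq, time)`` and a mask ``(batch, freq, time)`` yield a PSD
+ # matrix of shape ``(batch, freq, channel, channel)``.
+ #
+ #     >>> specgram = torch.randn(2, 4, 257, 100, dtype=torch.cdouble)
+ #     >>> mask = torch.rand(2, 257, 100)
+ #     >>> psd = PSD()(specgram, mask)             # torch.Size([2, 257, 4, 4])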
+
+class MVDR(torch.nn.Module):
+ """Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py
+
+ We provide three solutions of MVDR beamforming. One is based on *reference channel selection*
+ :cite:`souden2009optimal` (``solution=ref_channel``).
+
+ .. math::
+ \\textbf{w}_{\\text{MVDR}}(f) =\
+ \\frac{{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bf{\\Phi}_{\\textbf{SS}}}}(f)}\
+ {\\text{Trace}({{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f) \\bf{\\Phi}_{\\textbf{SS}}}(f))}}\\bm{u}
+
+ where :math:`\\bf{\\Phi}_{\\textbf{SS}}` and :math:`\\bf{\\Phi}_{\\textbf{NN}}` are the covariance\
+ matrices of speech and noise, respectively. :math:`\\bf{u}` is a one-hot vector to determine the\
+ reference channel.
+
+ The other two solutions are based on the steering vector (``solution=stv_evd`` or ``solution=stv_power``).
+
+ .. math::
+ \\textbf{w}_{\\text{MVDR}}(f) =\
+ \\frac{{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bm{v}}(f)}}\
+ {{\\bm{v}^{\\mathsf{H}}}(f){\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bm{v}}(f)}
+
+ where :math:`\\bm{v}` is the acoustic transfer function or the steering vector.\
+ :math:`.^{\\mathsf{H}}` denotes the Hermitian Conjugate operation.
+
+ We apply either *eigenvalue decomposition*
+ :cite:`higuchi2016robust` or the *power method* :cite:`mises1929praktische` to get the
+ steering vector from the PSD matrix of speech.
+
+ After estimating the beamforming weight, the enhanced Short-time Fourier Transform (STFT) is obtained by
+
+ .. math::
+ \\hat{\\bf{S}} = {\\bf{w}^\\mathsf{H}}{\\bf{Y}}, {\\bf{w}} \\in \\mathbb{C}^{M \\times F}
+
+ where :math:`\\bf{Y}` and :math:`\\hat{\\bf{S}}` are the STFT of the multi-channel noisy speech and\
+ the single-channel enhanced speech, respectively.
+
+ For online streaming audio, we provide a *recursive method* :cite:`higuchi2017online` to update the
+ PSD matrices of speech and noise, respectively.
+
+ Args:
+ ref_channel (int, optional): Reference channel for beamforming. (Default: ``0``)
+ solution (str, optional): Solution to compute the MVDR beamforming weights.
+ Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
+ multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``)
+ diag_loading (bool, optional): If ``True``, enables applying diagonal loading to the covariance matrix
+ of the noise. (Default: ``True``)
+ diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
+ It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
+ online (bool, optional): If ``True``, updates the MVDR beamforming weights based on
+ the previous covariance matrices. (Default: ``False``)
+
+ Note:
+ To improve the numerical stability, the input spectrogram will be converted to double precision
+ (``torch.complex128`` or ``torch.cdouble``) dtype for internal computation. The output spectrogram
+ is converted to the dtype of the input spectrogram to be compatible with other modules.
+
+ Note:
+ If you use ``stv_evd`` solution, the gradient of the same input may not be identical if the
+ eigenvalues of the PSD matrix are not distinct (i.e. some eigenvalues are close or identical).
+ """
+
+ def __init__(
+ self,
+ ref_channel: int = 0,
+ solution: str = "ref_channel",
+ multi_mask: bool = False,
+ diag_loading: bool = True,
+ diag_eps: float = 1e-7,
+ online: bool = False,
+ ):
+ super().__init__()
+ if solution not in [
+ "ref_channel",
+ "stv_evd",
+ "stv_power",
+ ]:
+ raise ValueError(
+ '`solution` must be one of ["ref_channel", "stv_evd", "stv_power"]. Given {}'.format(solution)
+ )
+ self.ref_channel = ref_channel
+ self.solution = solution
+ self.multi_mask = multi_mask
+ self.diag_loading = diag_loading
+ self.diag_eps = diag_eps
+ self.online = online
+ self.psd = PSD(multi_mask)
+
+ psd_s: torch.Tensor = torch.zeros(1)
+ psd_n: torch.Tensor = torch.zeros(1)
+ mask_sum_s: torch.Tensor = torch.zeros(1)
+ mask_sum_n: torch.Tensor = torch.zeros(1)
+ self.register_buffer("psd_s", psd_s)
+ self.register_buffer("psd_n", psd_n)
+ self.register_buffer("mask_sum_s", mask_sum_s)
+ self.register_buffer("mask_sum_n", mask_sum_n)
+
+ def _get_updated_mvdr_vector(
+ self,
+ psd_s: torch.Tensor,
+ psd_n: torch.Tensor,
+ mask_s: torch.Tensor,
+ mask_n: torch.Tensor,
+ reference_vector: torch.Tensor,
+ solution: str = "ref_channel",
+ diagonal_loading: bool = True,
+ diag_eps: float = 1e-7,
+ eps: float = 1e-8,
+ ) -> torch.Tensor:
+ r"""Recursively update the MVDR beamforming vector.
+
+ Args:
+ psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ mask_s (torch.Tensor): Time-Frequency mask of the target speech.
+ Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
+ or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
+ mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise.
+ Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
+ or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
+ reference_vector (torch.Tensor): One-hot reference channel matrix.
+ solution (str, optional): Solution to compute the MVDR beamforming weights.
+ Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
+ diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
+ (Default: ``True``)
+ diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
+ It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
+ eps (float, optional): Value to add to the denominator in the beamforming weight formula.
+ (Default: ``1e-8``)
+
+ Returns:
+ torch.Tensor: The MVDR beamforming weight matrix.
+ """
+ if self.multi_mask:
+ # Averaging mask along channel dimension
+ mask_s = mask_s.mean(dim=-3) # (..., freq, time)
+ mask_n = mask_n.mean(dim=-3) # (..., freq, time)
+ if self.psd_s.ndim == 1:
+ self.psd_s = psd_s
+ self.psd_n = psd_n
+ self.mask_sum_s = mask_s.sum(dim=-1)
+ self.mask_sum_n = mask_n.sum(dim=-1)
+ return _get_mvdr_vector(psd_s, psd_n, reference_vector, solution, diagonal_loading, diag_eps, eps)
+ else:
+ psd_s = self._get_updated_psd_speech(psd_s, mask_s)
+ psd_n = self._get_updated_psd_noise(psd_n, mask_n)
+ self.psd_s = psd_s
+ self.psd_n = psd_n
+ self.mask_sum_s = self.mask_sum_s + mask_s.sum(dim=-1)
+ self.mask_sum_n = self.mask_sum_n + mask_n.sum(dim=-1)
+ return _get_mvdr_vector(psd_s, psd_n, reference_vector, solution, diagonal_loading, diag_eps, eps)
+
+ def _get_updated_psd_speech(self, psd_s: torch.Tensor, mask_s: torch.Tensor) -> torch.Tensor:
+ r"""Update psd of speech recursively.
+
+ Args:
+ psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ mask_s (torch.Tensor): Time-Frequency mask of the target speech.
+ Tensor with dimensions `(..., freq, time)`.
+
+ Returns:
+ torch.Tensor: The updated PSD matrix of target speech.
+ """
+ numerator = self.mask_sum_s / (self.mask_sum_s + mask_s.sum(dim=-1))
+ denominator = 1 / (self.mask_sum_s + mask_s.sum(dim=-1))
+ psd_s = self.psd_s * numerator[..., None, None] + psd_s * denominator[..., None, None]
+ return psd_s
+
+ def _get_updated_psd_noise(self, psd_n: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
+ r"""Update psd of noise recursively.
+
+ Args:
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise.
+ Tensor with dimensions `(..., freq, time)`.
+
+ Returns:
+ torch.Tensor: The updated PSD matrix of noise.
+ """
+ numerator = self.mask_sum_n / (self.mask_sum_n + mask_n.sum(dim=-1))
+ denominator = 1 / (self.mask_sum_n + mask_n.sum(dim=-1))
+ psd_n = self.psd_n * numerator[..., None, None] + psd_n * denominator[..., None, None]
+ return psd_n
+
+ def forward(
+ self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """Perform MVDR beamforming.
+
+ Args:
+ specgram (torch.Tensor): Multi-channel complex-valued spectrum.
+ Tensor with dimensions `(..., channel, freq, time)`
+ mask_s (torch.Tensor): Time-Frequency mask of target speech.
+ Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
+ or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
+ mask_n (torch.Tensor or None, optional): Time-Frequency mask of noise.
+ Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
+ or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
+ (Default: None)
+
+ Returns:
+ torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
+ """
+ dtype = specgram.dtype
+ if specgram.ndim < 3:
+ raise ValueError(f"Expected at least 3D tensor (..., channel, freq, time). Found: {specgram.shape}")
+ if not specgram.is_complex():
+ raise ValueError(
+ f"The type of ``specgram`` tensor must be ``torch.cfloat`` or ``torch.cdouble``.\
+ Found: {specgram.dtype}"
+ )
+ if specgram.dtype == torch.cfloat:
+ specgram = specgram.cdouble() # Convert specgram to ``torch.cdouble``.
+
+ if mask_n is None:
+ warnings.warn("``mask_n`` is not provided, use ``1 - mask_s`` as ``mask_n``.")
+ mask_n = 1 - mask_s
+
+ psd_s = self.psd(specgram, mask_s) # (..., freq, channel, channel)
+ psd_n = self.psd(specgram, mask_n) # (..., freq, channel, channel)
+
+ u = torch.zeros(specgram.size()[:-2], device=specgram.device, dtype=torch.cdouble) # (..., channel)
+ u[..., self.ref_channel].fill_(1)
+
+ if self.online:
+ w_mvdr = self._get_updated_mvdr_vector(
+ psd_s, psd_n, mask_s, mask_n, u, self.solution, self.diag_loading, self.diag_eps
+ )
+ else:
+ w_mvdr = _get_mvdr_vector(psd_s, psd_n, u, self.solution, self.diag_loading, self.diag_eps)
+
+ specgram_enhanced = F.apply_beamforming(w_mvdr, specgram)
+
+ return specgram_enhanced.to(dtype)
+
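+ # Editor's sketch (assumption): end-to-end MVDR beamforming with a speech mask; per the
+ # ``forward`` above, ``mask_n`` defaults to ``1 - mask_s`` when omitted.
+ #
+ #     >>> specgram = torch.randn(2, 4, 257, 100, dtype=torch.cdouble)  # (..., channel, freq, time)
+ #     >>> mask_s = torch.rand(2, 257, 100)
+ #     >>> enhanced = MVDR(ref_channel=0)(specgram, mask_s)             # (2, 257, 100)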
+
+class RTFMVDR(torch.nn.Module):
+ r"""Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) module
+ based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the relative transfer function (RTF) matrix
+ or the steering vector of target speech :math:`\bm{v}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and
+ a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel
+ complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as:
+
+ .. math::
+ \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)
+
+ where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin,
+ :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation.
+
+ The beamforming weight is computed by:
+
+ .. math::
+ \textbf{w}_{\text{MVDR}}(f) =
+ \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}
+ {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}
+ """
+
+ def forward(
+ self,
+ specgram: Tensor,
+ rtf: Tensor,
+ psd_n: Tensor,
+ reference_channel: Union[int, Tensor],
+ diagonal_loading: bool = True,
+ diag_eps: float = 1e-7,
+ eps: float = 1e-8,
+ ) -> Tensor:
+ """
+ Args:
+ specgram (torch.Tensor): Multi-channel complex-valued spectrum.
+ Tensor with dimensions `(..., channel, freq, time)`
+ rtf (torch.Tensor): The complex-valued RTF vector of target speech.
+ Tensor with dimensions `(..., freq, channel)`.
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ reference_channel (int or torch.Tensor): Specifies the reference channel.
+ If the dtype is ``int``, it represents the reference channel index.
+ If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension
+ is one-hot.
+ diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
+ (Default: ``True``)
+ diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
+ It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
+ eps (float, optional): Value to add to the denominator in the beamforming weight formula.
+ (Default: ``1e-8``)
+
+ Returns:
+ torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
+ """
+ w_mvdr = F.mvdr_weights_rtf(rtf, psd_n, reference_channel, diagonal_loading, diag_eps, eps)
+ spectrum_enhanced = F.apply_beamforming(w_mvdr, specgram)
+ return spectrum_enhanced
+
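+ # Editor's sketch (assumption): one way to obtain the RTF/steering vector is eigenvalue
+ # decomposition of the speech PSD via ``F.rtf_evd``, as used in ``_get_mvdr_vector`` above.
+ #
+ #     >>> specgram = torch.randn(2, 4, 257, 100, dtype=torch.cdouble)
+ #     >>> mask_s = torch.rand(2, 257, 100)
+ #     >>> psd = PSD()
+ #     >>> psd_s, psd_n = psd(specgram, mask_s), psd(specgram, 1 - mask_s)
+ #     >>> rtf = F.rtf_evd(psd_s)                               # (..., freq, channel)
+ #     >>> enhanced = RTFMVDR()(specgram, rtf, psd_n, reference_channel=0)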
+
+class SoudenMVDR(torch.nn.Module):
+ r"""Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) module
+ based on the method proposed by *Souden et al.* :cite:`souden2009optimal`.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the power spectral density (PSD) matrix
+ of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and
+ a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel
+ complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as:
+
+ .. math::
+ \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)
+
+ where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin.
+
+ The beamforming weight is computed by:
+
+ .. math::
+ \textbf{w}_{\text{MVDR}}(f) =
+ \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}
+ {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u}
+ """
+
+ def forward(
+ self,
+ specgram: Tensor,
+ psd_s: Tensor,
+ psd_n: Tensor,
+ reference_channel: Union[int, Tensor],
+ diagonal_loading: bool = True,
+ diag_eps: float = 1e-7,
+ eps: float = 1e-8,
+ ) -> torch.Tensor:
+ """
+ Args:
+ specgram (torch.Tensor): Multi-channel complex-valued spectrum.
+ Tensor with dimensions `(..., channel, freq, time)`.
+ psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
+ Tensor with dimensions `(..., freq, channel, channel)`.
+ reference_channel (int or torch.Tensor): Specifies the reference channel.
+ If the dtype is ``int``, it represents the reference channel index.
+ If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension
+ is one-hot.
+ diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
+ (Default: ``True``)
+ diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
+ It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
+ eps (float, optional): Value to add to the denominator in the beamforming weight formula.
+ (Default: ``1e-8``)
+
+ Returns:
+ torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
+ """
+ w_mvdr = F.mvdr_weights_souden(psd_s, psd_n, reference_channel, diagonal_loading, diag_eps, eps)
+ spectrum_enhanced = F.apply_beamforming(w_mvdr, specgram)
+ return spectrum_enhanced
diff --git a/MLPY/Lib/site-packages/torchaudio/transforms/_transforms.py b/MLPY/Lib/site-packages/torchaudio/transforms/_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..242fd971a8d1efabb485a4cdbda2a8f1dbf59f02
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/transforms/_transforms.py
@@ -0,0 +1,2137 @@
+# -*- coding: utf-8 -*-
+
+import math
+import warnings
+from typing import Callable, Optional, Sequence, Tuple, Union
+
+import torch
+from torch import Tensor
+from torch.nn.modules.lazy import LazyModuleMixin
+from torch.nn.parameter import UninitializedParameter
+
+from torchaudio import functional as F
+from torchaudio.functional.functional import (
+ _apply_sinc_resample_kernel,
+ _check_convolve_mode,
+ _fix_waveform_shape,
+ _get_sinc_resample_kernel,
+ _stretch_waveform,
+)
+
+__all__ = []
+
+
+class Spectrogram(torch.nn.Module):
+ r"""Create a spectrogram from a audio signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ pad (int, optional): Two sided padding of signal. (Default: ``0``)
+ window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ power (float or None, optional): Exponent for the magnitude spectrogram,
+ (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
+ If None, then the complex spectrum is returned instead. (Default: ``2``)
+ normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are
+ ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to
+ ``"window"``. (Default: ``False``)
+ wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+ center (bool, optional): whether to pad :attr:`waveform` on both sides so
+ that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+ (Default: ``True``)
+ pad_mode (string, optional): controls the padding method used when
+ :attr:`center` is ``True``. (Default: ``"reflect"``)
+ onesided (bool, optional): controls whether to return half of results to
+ avoid redundancy (Default: ``True``)
+ return_complex (bool, optional):
+ Deprecated and not used.
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = torchaudio.transforms.Spectrogram(n_fft=800)
+ >>> spectrogram = transform(waveform)
+
+ """
+ __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"]
+
+ def __init__(
+ self,
+ n_fft: int = 400,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ pad: int = 0,
+ window_fn: Callable[..., Tensor] = torch.hann_window,
+ power: Optional[float] = 2.0,
+ normalized: Union[bool, str] = False,
+ wkwargs: Optional[dict] = None,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ onesided: bool = True,
+ return_complex: Optional[bool] = None,
+ ) -> None:
+ super(Spectrogram, self).__init__()
+ torch._C._log_api_usage_once("torchaudio.transforms.Spectrogram")
+ self.n_fft = n_fft
+ # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
+ # number of frequencies due to onesided=True in torch.stft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+ self.register_buffer("window", window)
+ self.pad = pad
+ self.power = power
+ self.normalized = normalized
+ self.center = center
+ self.pad_mode = pad_mode
+ self.onesided = onesided
+ if return_complex is not None:
+ warnings.warn(
+ "`return_complex` argument is now deprecated and is not effective."
+ "`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with "
+ "complex dtype. Please remove the argument in the function call."
+ )
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension (..., time).
+
+ Returns:
+ Tensor: Dimension (..., freq, time), where freq is
+ ``n_fft // 2 + 1`` where ``n_fft`` is the number of
+ Fourier bins, and time is the number of window hops (n_frame).
+ """
+ return F.spectrogram(
+ waveform,
+ self.pad,
+ self.window,
+ self.n_fft,
+ self.hop_length,
+ self.win_length,
+ self.power,
+ self.normalized,
+ self.center,
+ self.pad_mode,
+ self.onesided,
+ )
+
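+ # Editor's sketch (assumption): with ``power=None`` the transform returns the complex spectrum,
+ # which can be inverted by ``InverseSpectrogram`` (defined below) to recover the waveform.
+ #
+ #     >>> waveform = torch.randn(1, 16000)
+ #     >>> spec_c = Spectrogram(n_fft=512, power=None)(waveform)           # complex dtype
+ #     >>> recon = InverseSpectrogram(n_fft=512)(spec_c, length=waveform.size(-1))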
+
+class InverseSpectrogram(torch.nn.Module):
+ r"""Create an inverse spectrogram to recover an audio signal from a spectrogram.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ pad (int, optional): Two sided padding of signal. (Default: ``0``)
+ window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ normalized (bool or str, optional): Whether the stft output was normalized by magnitude. If input is str,
+ choices are ``"window"`` and ``"frame_length"``, dependent on normalization mode. ``True`` maps to
+ ``"window"``. (Default: ``False``)
+ wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+ center (bool, optional): whether the signal in spectrogram was padded on both sides so
+ that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+ (Default: ``True``)
+ pad_mode (string, optional): controls the padding method used when
+ :attr:`center` is ``True``. (Default: ``"reflect"``)
+ onesided (bool, optional): controls whether spectrogram was used to return half of results to
+ avoid redundancy (Default: ``True``)
+
+ Example
+ >>> batch, freq, time = 2, 257, 100
+ >>> length = 25344
+ >>> spectrogram = torch.randn(batch, freq, time, dtype=torch.cdouble)
+ >>> transform = transforms.InverseSpectrogram(n_fft=512)
+ >>> waveform = transform(spectrogram, length)
+ """
+ __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"]
+
+ def __init__(
+ self,
+ n_fft: int = 400,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ pad: int = 0,
+ window_fn: Callable[..., Tensor] = torch.hann_window,
+ normalized: Union[bool, str] = False,
+ wkwargs: Optional[dict] = None,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ onesided: bool = True,
+ ) -> None:
+ super(InverseSpectrogram, self).__init__()
+ self.n_fft = n_fft
+ # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
+ # number of frequencies due to onesided=True in torch.stft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+ self.register_buffer("window", window)
+ self.pad = pad
+ self.normalized = normalized
+ self.center = center
+ self.pad_mode = pad_mode
+ self.onesided = onesided
+
+ def forward(self, spectrogram: Tensor, length: Optional[int] = None) -> Tensor:
+ r"""
+ Args:
+ spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time).
+ length (int or None, optional): The output length of the waveform.
+
+ Returns:
+ Tensor: Dimension (..., time), Least squares estimation of the original signal.
+ """
+ return F.inverse_spectrogram(
+ spectrogram,
+ length,
+ self.pad,
+ self.window,
+ self.n_fft,
+ self.hop_length,
+ self.win_length,
+ self.normalized,
+ self.center,
+ self.pad_mode,
+ self.onesided,
+ )
+
+
+class GriffinLim(torch.nn.Module):
+ r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Implementation ported from
+ *librosa* :cite:`brian_mcfee-proc-scipy-2015`, *A fast Griffin-Lim algorithm* :cite:`6701851`
+ and *Signal estimation from modified short-time Fourier transform* :cite:`1172092`.
+
+ Args:
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+ n_iter (int, optional): Number of iteration for phase recovery process. (Default: ``32``)
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ power (float, optional): Exponent for the magnitude spectrogram,
+ (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
+ wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+ momentum (float, optional): The momentum parameter for fast Griffin-Lim.
+ Setting this to 0 recovers the original Griffin-Lim method.
+ Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``)
+ length (int, optional): Array length of the expected output. (Default: ``None``)
+ rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``)
+
+ Example
+ >>> batch, freq, time = 2, 257, 100
+ >>> spectrogram = torch.randn(batch, freq, time)
+ >>> transform = transforms.GriffinLim(n_fft=512)
+ >>> waveform = transform(spectrogram)
+ """
+ __constants__ = ["n_fft", "n_iter", "win_length", "hop_length", "power", "length", "momentum", "rand_init"]
+
+ def __init__(
+ self,
+ n_fft: int = 400,
+ n_iter: int = 32,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ window_fn: Callable[..., Tensor] = torch.hann_window,
+ power: float = 2.0,
+ wkwargs: Optional[dict] = None,
+ momentum: float = 0.99,
+ length: Optional[int] = None,
+ rand_init: bool = True,
+ ) -> None:
+ super(GriffinLim, self).__init__()
+
+ if not (0 <= momentum < 1):
+ raise ValueError("momentum must be in the range [0, 1). Found: {}".format(momentum))
+
+ self.n_fft = n_fft
+ self.n_iter = n_iter
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+ self.register_buffer("window", window)
+ self.length = length
+ self.power = power
+ self.momentum = momentum
+ self.rand_init = rand_init
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""
+ Args:
+ specgram (Tensor):
+ A magnitude-only STFT spectrogram of dimension (..., freq, frames)
+ where freq is ``n_fft // 2 + 1``.
+
+ Returns:
+ Tensor: waveform of (..., time), where time equals the ``length`` parameter if given.
+ """
+ return F.griffinlim(
+ specgram,
+ self.window,
+ self.n_fft,
+ self.hop_length,
+ self.win_length,
+ self.power,
+ self.n_iter,
+ self.momentum,
+ self.length,
+ self.rand_init,
+ )
+
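+ # Editor's sketch (assumption): Griffin-Lim recovers a waveform from a magnitude-only
+ # spectrogram; ``power`` should match the value used to compute the spectrogram.
+ #
+ #     >>> spec = Spectrogram(n_fft=512, power=2.0)(torch.randn(1, 16000))
+ #     >>> recon = GriffinLim(n_fft=512, power=2.0)(spec)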
+
+class AmplitudeToDB(torch.nn.Module):
+ r"""Turn a tensor from the power/amplitude scale to the decibel scale.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ This output depends on the maximum value in the input tensor, and so
+ may return different values for an audio clip split into snippets vs.
+ a full clip.
+
+ Args:
+ stype (str, optional): scale of input tensor (``"power"`` or ``"magnitude"``). Power
+ is the elementwise square of the magnitude. (Default: ``"power"``)
+ top_db (float or None, optional): minimum negative cut-off in decibels. A reasonable
+ number is 80. (Default: ``None``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.AmplitudeToDB(stype="magnitude", top_db=80)
+ >>> waveform_db = transform(waveform)
+ """
+ __constants__ = ["multiplier", "amin", "ref_value", "db_multiplier"]
+
+ def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None:
+ super(AmplitudeToDB, self).__init__()
+ self.stype = stype
+ if top_db is not None and top_db < 0:
+ raise ValueError("top_db must be positive value")
+ self.top_db = top_db
+ self.multiplier = 10.0 if stype == "power" else 20.0
+ self.amin = 1e-10
+ self.ref_value = 1.0
+ self.db_multiplier = math.log10(max(self.amin, self.ref_value))
+
+ def forward(self, x: Tensor) -> Tensor:
+ r"""Numerically stable implementation from Librosa.
+
+ https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html
+
+ Args:
+ x (Tensor): Input tensor before being converted to decibel scale.
+
+ Returns:
+ Tensor: Output tensor in decibel scale.
+ """
+ return F.amplitude_to_DB(x, self.multiplier, self.amin, self.db_multiplier, self.top_db)
+
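+ # Editor's sketch (assumption): converting a power mel spectrogram to decibels with an
+ # 80 dB dynamic-range cutoff; ``MelSpectrogram`` is defined later in this module.
+ #
+ #     >>> mel = MelSpectrogram(sample_rate=16000)(torch.randn(1, 16000))  # power scale
+ #     >>> mel_db = AmplitudeToDB(stype="power", top_db=80)(mel)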
+
+class MelScale(torch.nn.Module):
+ r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+ n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
+ norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band
+ (area normalization). (Default: ``None``)
+ mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
+ >>> spectrogram = spectrogram_transform(waveform)
+ >>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
+ >>> melscale_spectrogram = melscale_transform(spectrogram)
+
+ See also:
+ :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+ generate the filter banks.
+ """
+ __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"]
+
+ def __init__(
+ self,
+ n_mels: int = 128,
+ sample_rate: int = 16000,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ n_stft: int = 201,
+ norm: Optional[str] = None,
+ mel_scale: str = "htk",
+ ) -> None:
+ super(MelScale, self).__init__()
+ self.n_mels = n_mels
+ self.sample_rate = sample_rate
+ self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+ self.f_min = f_min
+ self.norm = norm
+ self.mel_scale = mel_scale
+
+ if f_min > self.f_max:
+ raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
+
+ fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm, self.mel_scale)
+ self.register_buffer("fb", fb)
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""
+ Args:
+ specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).
+
+ Returns:
+ Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
+ """
+
+ # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time)
+ mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
+
+ return mel_specgram
+
+
+class InverseMelScale(torch.nn.Module):
+ r"""Estimate a STFT in normal frequency domain from mel frequency domain.
+
+ .. devices:: CPU CUDA
+
+ It minimizes the euclidean norm between the input mel-spectrogram and the product between
+ the estimated spectrogram and the filter banks using `torch.linalg.lstsq`.
+
+ Args:
+ n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
+ n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+ norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
+ (area normalization). (Default: ``None``)
+ mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+ driver (str, optional): Name of the LAPACK/MAGMA method to be used for `torch.linalg.lstsq`.
+ For CPU inputs the valid values are ``"gels"``, ``"gelsy"``, ``"gelsd"``, ``"gelss"``.
+ For CUDA input, the only valid driver is ``"gels"``, which assumes that A is full-rank.
+ (Default: ``"gels"``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> mel_spectrogram_transform = transforms.MelSpectrogram(sample_rate, n_fft=1024)
+ >>> mel_spectrogram = mel_spectrogram_transform(waveform)
+ >>> inverse_melscale_transform = transforms.InverseMelScale(n_stft=1024 // 2 + 1)
+ >>> spectrogram = inverse_melscale_transform(mel_spectrogram)
+ """
+ __constants__ = [
+ "n_stft",
+ "n_mels",
+ "sample_rate",
+ "f_min",
+ "f_max",
+ ]
+
+ def __init__(
+ self,
+ n_stft: int,
+ n_mels: int = 128,
+ sample_rate: int = 16000,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ norm: Optional[str] = None,
+ mel_scale: str = "htk",
+ driver: str = "gels",
+ ) -> None:
+ super(InverseMelScale, self).__init__()
+ self.n_mels = n_mels
+ self.sample_rate = sample_rate
+ self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+ self.f_min = f_min
+ self.driver = driver
+
+ if f_min > self.f_max:
+ raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
+
+ if driver not in ["gels", "gelsy", "gelsd", "gelss"]:
+ raise ValueError(f'driver must be one of ["gels", "gelsy", "gelsd", "gelss"]. Found {driver}.')
+
+ fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale)
+ self.register_buffer("fb", fb)
+
+ def forward(self, melspec: Tensor) -> Tensor:
+ r"""
+ Args:
+ melspec (Tensor): A Mel frequency spectrogram of dimension (..., ``n_mels``, time)
+
+ Returns:
+ Tensor: Linear scale spectrogram of size (..., freq, time)
+ """
+ # pack batch
+ shape = melspec.size()
+ melspec = melspec.view(-1, shape[-2], shape[-1])
+
+ n_mels, time = shape[-2], shape[-1]
+ freq, _ = self.fb.size() # (freq, n_mels)
+ if self.n_mels != n_mels:
+ raise ValueError("Expected an input with {} mel bins. Found: {}".format(self.n_mels, n_mels))
+
+ specgram = torch.relu(torch.linalg.lstsq(self.fb.transpose(-1, -2)[None], melspec, driver=self.driver).solution)
+
+ # unpack batch
+ specgram = specgram.view(shape[:-2] + (freq, time))
+ return specgram
+
+
+class MelSpectrogram(torch.nn.Module):
+ r"""Create MelSpectrogram for a raw audio signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
+ and :py:func:`torchaudio.transforms.MelScale`.
+
+ Sources
+ * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe
+ * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html
+ * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
+
+ Args:
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``None``)
+ pad (int, optional): Two sided padding of signal. (Default: ``0``)
+ n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+ window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ power (float, optional): Exponent for the magnitude spectrogram
+ (must be > 0), e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
+ normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
+ wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
+ center (bool, optional): whether to pad :attr:`waveform` on both sides so
+ that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+ (Default: ``True``)
+ pad_mode (string, optional): controls the padding method used when
+ :attr:`center` is ``True``. (Default: ``"reflect"``)
+ onesided: Deprecated and unused.
+ norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
+ (area normalization). (Default: ``None``)
+ mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.MelSpectrogram(sample_rate)
+ >>> mel_specgram = transform(waveform) # (channel, n_mels, time)
+
+ See also:
+ :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+ generate the filter banks.
+ """
+ __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_mels", "f_min"]
+
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ n_fft: int = 400,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ pad: int = 0,
+ n_mels: int = 128,
+ window_fn: Callable[..., Tensor] = torch.hann_window,
+ power: float = 2.0,
+ normalized: bool = False,
+ wkwargs: Optional[dict] = None,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ onesided: Optional[bool] = None,
+ norm: Optional[str] = None,
+ mel_scale: str = "htk",
+ ) -> None:
+ super(MelSpectrogram, self).__init__()
+ torch._C._log_api_usage_once("torchaudio.transforms.MelSpectrogram")
+
+ if onesided is not None:
+ warnings.warn(
+ "Argument 'onesided' has been deprecated and has no influence on the behavior of this module."
+ )
+
+ self.sample_rate = sample_rate
+ self.n_fft = n_fft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ self.pad = pad
+ self.power = power
+ self.normalized = normalized
+ self.n_mels = n_mels # number of mel frequency bins
+ self.f_max = f_max
+ self.f_min = f_min
+ self.spectrogram = Spectrogram(
+ n_fft=self.n_fft,
+ win_length=self.win_length,
+ hop_length=self.hop_length,
+ pad=self.pad,
+ window_fn=window_fn,
+ power=self.power,
+ normalized=self.normalized,
+ wkwargs=wkwargs,
+ center=center,
+ pad_mode=pad_mode,
+ onesided=True,
+ )
+ self.mel_scale = MelScale(
+ self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, norm, mel_scale
+ )
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension (..., time).
+
+ Returns:
+ Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
+ """
+ specgram = self.spectrogram(waveform)
+ mel_specgram = self.mel_scale(specgram)
+ return mel_specgram
+
+
+class MFCC(torch.nn.Module):
+ r"""Create the Mel-frequency cepstrum coefficients from an audio signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ By default, this calculates the MFCC on the DB-scaled Mel spectrogram.
+ This is not the textbook implementation, but is implemented here to
+ give consistency with librosa.
+
+ This output depends on the maximum value in the input spectrogram, and so
+ may return different values for an audio clip split into snippets vs. a
+ full clip.
+
+ Args:
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``)
+ dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``)
+ norm (str, optional): norm to use. (Default: ``"ortho"``)
+ log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``)
+ melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.MFCC(
+ >>> sample_rate=sample_rate,
+ >>> n_mfcc=13,
+ >>> melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False},
+ >>> )
+ >>> mfcc = transform(waveform)
+
+ See also:
+ :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+ generate the filter banks.
+ """
+ __constants__ = ["sample_rate", "n_mfcc", "dct_type", "top_db", "log_mels"]
+
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ n_mfcc: int = 40,
+ dct_type: int = 2,
+ norm: str = "ortho",
+ log_mels: bool = False,
+ melkwargs: Optional[dict] = None,
+ ) -> None:
+ super(MFCC, self).__init__()
+ supported_dct_types = [2]
+ if dct_type not in supported_dct_types:
+ raise ValueError("DCT type not supported: {}".format(dct_type))
+ self.sample_rate = sample_rate
+ self.n_mfcc = n_mfcc
+ self.dct_type = dct_type
+ self.norm = norm
+ self.top_db = 80.0
+ self.amplitude_to_DB = AmplitudeToDB("power", self.top_db)
+
+ melkwargs = melkwargs or {}
+ self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs)
+
+ if self.n_mfcc > self.MelSpectrogram.n_mels:
+ raise ValueError("Cannot select more MFCC coefficients than # mel bins")
+ dct_mat = F.create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm)
+ self.register_buffer("dct_mat", dct_mat)
+ self.log_mels = log_mels
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension (..., time).
+
+ Returns:
+ Tensor: specgram_mel_db of size (..., ``n_mfcc``, time).
+ """
+ mel_specgram = self.MelSpectrogram(waveform)
+ if self.log_mels:
+ log_offset = 1e-6
+ mel_specgram = torch.log(mel_specgram + log_offset)
+ else:
+ mel_specgram = self.amplitude_to_DB(mel_specgram)
+
+ # (..., time, n_mels) dot (n_mels, n_mfcc) -> (..., n_mfcc, time)
+ mfcc = torch.matmul(mel_specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2)
+ return mfcc
+
+
+class LFCC(torch.nn.Module):
+ r"""Create the linear-frequency cepstrum coefficients from an audio signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram.
+ This is not the textbook implementation, but is implemented here to
+ give consistency with librosa.
+
+ This output depends on the maximum value in the input spectrogram, and so
+ may return different values for an audio clip split into snippets vs. a
+ full clip.
+
+ Args:
+ sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+ n_filter (int, optional): Number of linear filters to apply. (Default: ``128``)
+ n_lfcc (int, optional): Number of lfc coefficients to retain. (Default: ``40``)
+ f_min (float, optional): Minimum frequency. (Default: ``0.``)
+ f_max (float or None, optional): Maximum frequency. (Default: ``None``)
+ dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``)
+ norm (str, optional): norm to use. (Default: ``"ortho"``)
+ log_lf (bool, optional): whether to use log-lf spectrograms instead of db-scaled. (Default: ``False``)
+ speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.LFCC(
+ >>> sample_rate=sample_rate,
+ >>> n_lfcc=13,
+ >>> speckwargs={"n_fft": 400, "hop_length": 160, "center": False},
+ >>> )
+ >>> lfcc = transform(waveform)
+
+ See also:
+ :py:func:`torchaudio.functional.linear_fbanks` - The function used to
+ generate the filter banks.
+ """
+ __constants__ = ["sample_rate", "n_filter", "n_lfcc", "dct_type", "top_db", "log_lf"]
+
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ n_filter: int = 128,
+ f_min: float = 0.0,
+ f_max: Optional[float] = None,
+ n_lfcc: int = 40,
+ dct_type: int = 2,
+ norm: str = "ortho",
+ log_lf: bool = False,
+ speckwargs: Optional[dict] = None,
+ ) -> None:
+ super(LFCC, self).__init__()
+ supported_dct_types = [2]
+ if dct_type not in supported_dct_types:
+ raise ValueError("DCT type not supported: {}".format(dct_type))
+ self.sample_rate = sample_rate
+ self.f_min = f_min
+ self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+ self.n_filter = n_filter
+ self.n_lfcc = n_lfcc
+ self.dct_type = dct_type
+ self.norm = norm
+ self.top_db = 80.0
+ self.amplitude_to_DB = AmplitudeToDB("power", self.top_db)
+
+ speckwargs = speckwargs or {}
+ self.Spectrogram = Spectrogram(**speckwargs)
+
+ if self.n_lfcc > self.Spectrogram.n_fft:
+ raise ValueError("Cannot select more LFCC coefficients than # fft bins")
+
+ filter_mat = F.linear_fbanks(
+ n_freqs=self.Spectrogram.n_fft // 2 + 1,
+ f_min=self.f_min,
+ f_max=self.f_max,
+ n_filter=self.n_filter,
+ sample_rate=self.sample_rate,
+ )
+ self.register_buffer("filter_mat", filter_mat)
+
+ dct_mat = F.create_dct(self.n_lfcc, self.n_filter, self.norm)
+ self.register_buffer("dct_mat", dct_mat)
+ self.log_lf = log_lf
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension (..., time).
+
+ Returns:
+ Tensor: Linear Frequency Cepstral Coefficients of size (..., ``n_lfcc``, time).
+ """
+ specgram = self.Spectrogram(waveform)
+
+ # (..., time, freq) dot (freq, n_filter) -> (..., n_filter, time)
+ specgram = torch.matmul(specgram.transpose(-1, -2), self.filter_mat).transpose(-1, -2)
+
+ if self.log_lf:
+ log_offset = 1e-6
+ specgram = torch.log(specgram + log_offset)
+ else:
+ specgram = self.amplitude_to_DB(specgram)
+
+ # (..., time, n_filter) dot (n_filter, n_lfcc) -> (..., n_lfcc, time)
+ lfcc = torch.matmul(specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2)
+ return lfcc
+
+
+class MuLawEncoding(torch.nn.Module):
+ r"""Encode signal based on mu-law companding.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: TorchScript
+
+ For more info see the
+ `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
+
+ This algorithm assumes the signal has been scaled to between -1 and 1 and
+ returns a signal encoded with values from 0 to ``quantization_channels - 1``.
+
+ Args:
+ quantization_channels (int, optional): Number of channels. (Default: ``256``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512)
+ >>> mulawtrans = transform(waveform)
+
+ """
+ __constants__ = ["quantization_channels"]
+
+ def __init__(self, quantization_channels: int = 256) -> None:
+ super(MuLawEncoding, self).__init__()
+ self.quantization_channels = quantization_channels
+
+ def forward(self, x: Tensor) -> Tensor:
+ r"""
+ Args:
+ x (Tensor): A signal to be encoded.
+
+ Returns:
+ Tensor: An encoded signal.
+ """
+ return F.mu_law_encoding(x, self.quantization_channels)
+
+
+class MuLawDecoding(torch.nn.Module):
+ r"""Decode mu-law encoded signal.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: TorchScript
+
+ For more info see the
+ `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
+
+ This expects an input with values between 0 and ``quantization_channels - 1``
+ and returns a signal scaled between -1 and 1.
+
+ Args:
+ quantization_channels (int, optional): Number of channels. (Default: ``256``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512)
+ >>> mulawtrans = transform(waveform)
+ """
+ __constants__ = ["quantization_channels"]
+
+ def __init__(self, quantization_channels: int = 256) -> None:
+ super(MuLawDecoding, self).__init__()
+ self.quantization_channels = quantization_channels
+
+ def forward(self, x_mu: Tensor) -> Tensor:
+ r"""
+ Args:
+ x_mu (Tensor): A mu-law encoded signal which needs to be decoded.
+
+ Returns:
+ Tensor: The signal decoded.
+ """
+ return F.mu_law_decoding(x_mu, self.quantization_channels)
+
+
+class Resample(torch.nn.Module):
+ r"""Resample a signal from one frequency to another. A resampling method can be given.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Note:
+ If resampling on waveforms of higher precision than float32, there may be a small loss of precision
+ because the kernel is cached once as float32. If high precision resampling is important for your application,
+ the functional form will retain higher precision, but run slower because it does not cache the kernel.
+ Alternatively, you could rewrite a transform that caches a higher precision kernel.
+
+ Args:
+ orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``)
+ new_freq (int, optional): The desired frequency. (Default: ``16000``)
+ resampling_method (str, optional): The resampling method to use.
+ Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``)
+ lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper
+ but less efficient. (Default: ``6``)
+ rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist.
+ Lower values reduce aliasing, but also attenuate some of the highest frequencies. (Default: ``0.99``)
+ beta (float or None, optional): The shape parameter used for the Kaiser window.
+ dtype (torch.dtype, optional):
+ Determines the precision with which the resampling kernel is pre-computed and cached. If not provided,
+ the kernel is computed with ``torch.float64`` then cached as ``torch.float32``.
+ If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and
+ cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this
+ argument, please use ``Resample.to(dtype)``, so that the kernel generation is still
+ carried out on ``torch.float64``.
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.Resample(sample_rate, sample_rate/10)
+ >>> waveform = transform(waveform)
+ """
+
+ def __init__(
+ self,
+ orig_freq: int = 16000,
+ new_freq: int = 16000,
+ resampling_method: str = "sinc_interp_hann",
+ lowpass_filter_width: int = 6,
+ rolloff: float = 0.99,
+ beta: Optional[float] = None,
+ *,
+ dtype: Optional[torch.dtype] = None,
+ ) -> None:
+ super().__init__()
+
+ self.orig_freq = orig_freq
+ self.new_freq = new_freq
+ self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq))
+ self.resampling_method = resampling_method
+ self.lowpass_filter_width = lowpass_filter_width
+ self.rolloff = rolloff
+ self.beta = beta
+
+ if self.orig_freq != self.new_freq:
+ kernel, self.width = _get_sinc_resample_kernel(
+ self.orig_freq,
+ self.new_freq,
+ self.gcd,
+ self.lowpass_filter_width,
+ self.rolloff,
+ self.resampling_method,
+ beta,
+ dtype=dtype,
+ )
+ self.register_buffer("kernel", kernel)
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension (..., time).
+
+ Returns:
+ Tensor: Output signal of dimension (..., time).
+ """
+ if self.orig_freq == self.new_freq:
+ return waveform
+ return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width)
+
+
+class ComputeDeltas(torch.nn.Module):
+ r"""Compute delta coefficients of a tensor, usually a spectrogram.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ See `torchaudio.functional.compute_deltas` for more details.
+
+ Args:
+ win_length (int, optional): The window length used for computing delta. (Default: ``5``)
+ mode (str, optional): Mode parameter passed to padding. (Default: ``"replicate"``)
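+
+ Example
+ >>> # Illustrative usage; the deltas are computed over a spectrogram as in the other examples.
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> specgram = torchaudio.transforms.Spectrogram()(waveform) # (channel, freq, time)
+ >>> transform = torchaudio.transforms.ComputeDeltas(win_length=5)
+ >>> deltas = transform(specgram) # (channel, freq, time)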
+ """
+ __constants__ = ["win_length"]
+
+ def __init__(self, win_length: int = 5, mode: str = "replicate") -> None:
+ super(ComputeDeltas, self).__init__()
+ self.win_length = win_length
+ self.mode = mode
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""
+ Args:
+ specgram (Tensor): Tensor of audio of dimension (..., freq, time).
+
+ Returns:
+ Tensor: Tensor of deltas of dimension (..., freq, time).
+ """
+ return F.compute_deltas(specgram, win_length=self.win_length, mode=self.mode)
+
+
+class TimeStretch(torch.nn.Module):
+ r"""Stretch stft in time without modifying pitch for a given rate.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Proposed in *SpecAugment* :cite:`specaugment`.
+
+ Args:
+ hop_length (int or None, optional): Length of hop between STFT windows.
+ (Default: ``n_fft // 2``, where ``n_fft == (n_freq - 1) * 2``)
+ n_freq (int, optional): number of filter banks from stft. (Default: ``201``)
+ fixed_rate (float or None, optional): rate to speed up or slow down by.
+ If None is provided, rate must be passed to the forward method. (Default: ``None``)
+
+ .. note::
+
+ The expected input is a raw, complex-valued spectrogram.
+
+ Example
+ >>> spectrogram = torchaudio.transforms.Spectrogram(power=None)
+ >>> stretch = torchaudio.transforms.TimeStretch()
+ >>>
+ >>> original = spectrogram(waveform)
+ >>> stretched_1_2 = stretch(original, 1.2)
+ >>> stretched_0_9 = stretch(original, 0.9)
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch.png
+ :width: 600
+ :alt: The visualization of stretched spectrograms.
+ """
+ __constants__ = ["fixed_rate"]
+
+ def __init__(self, hop_length: Optional[int] = None, n_freq: int = 201, fixed_rate: Optional[float] = None) -> None:
+ super(TimeStretch, self).__init__()
+
+ self.fixed_rate = fixed_rate
+
+ n_fft = (n_freq - 1) * 2
+ hop_length = hop_length if hop_length is not None else n_fft // 2
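+ # Expected phase advance (in radians) from one frame to the next for each frequency bin; shape (n_freq, 1).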
+ self.register_buffer("phase_advance", torch.linspace(0, math.pi * hop_length, n_freq)[..., None])
+
+ def forward(self, complex_specgrams: Tensor, overriding_rate: Optional[float] = None) -> Tensor:
+ r"""
+ Args:
+ complex_specgrams (Tensor):
+ A tensor of dimension `(..., freq, num_frame)` with complex dtype.
+ overriding_rate (float or None, optional): speed up to apply to this batch.
+ If no rate is passed, use ``self.fixed_rate``. (Default: ``None``)
+
+ Returns:
+ Tensor:
+ Stretched spectrogram. The resulting tensor is of the corresponding complex dtype
+ as the input spectrogram, and the number of frames is changed to ``ceil(num_frame / rate)``.
+ """
+ if not torch.is_complex(complex_specgrams):
+ warnings.warn(
+ "The input to TimeStretch must be complex type. "
+ "Providing non-complex tensor produces invalid results.",
+ stacklevel=4,
+ )
+
+ if overriding_rate is None:
+ if self.fixed_rate is None:
+ raise ValueError("If no fixed_rate is specified, must pass a valid rate to the forward method.")
+ rate = self.fixed_rate
+ else:
+ rate = overriding_rate
+ return F.phase_vocoder(complex_specgrams, rate, self.phase_advance)
+
+
+class Fade(torch.nn.Module):
+ r"""Add a fade in and/or fade out to an waveform.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``)
+ fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``)
+ fade_shape (str, optional): Shape of fade. Must be one of: ``"quarter_sine"``,
+ ``"half_sine"``, ``"linear"``, ``"logarithmic"``, ``"exponential"``.
+ (Default: ``"linear"``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape="linear")
+ >>> faded_waveform = transform(waveform)
+ """
+
+ def __init__(self, fade_in_len: int = 0, fade_out_len: int = 0, fade_shape: str = "linear") -> None:
+ super(Fade, self).__init__()
+ self.fade_in_len = fade_in_len
+ self.fade_out_len = fade_out_len
+ self.fade_shape = fade_shape
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+ Returns:
+ Tensor: Tensor of audio of dimension `(..., time)`.
+ """
+ waveform_length = waveform.size()[-1]
+ device = waveform.device
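+ # The fade-in and fade-out gain curves are multiplied element-wise with the waveform; each is 1 outside its fade region.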
+ return self._fade_in(waveform_length, device) * self._fade_out(waveform_length, device) * waveform
+
+ def _fade_in(self, waveform_length: int, device: torch.device) -> Tensor:
+ fade = torch.linspace(0, 1, self.fade_in_len, device=device)
+ ones = torch.ones(waveform_length - self.fade_in_len, device=device)
+
+ if self.fade_shape == "linear":
+ fade = fade
+
+ if self.fade_shape == "exponential":
+ fade = torch.pow(2, (fade - 1)) * fade
+
+ if self.fade_shape == "logarithmic":
+ fade = torch.log10(0.1 + fade) + 1
+
+ if self.fade_shape == "quarter_sine":
+ fade = torch.sin(fade * math.pi / 2)
+
+ if self.fade_shape == "half_sine":
+ fade = torch.sin(fade * math.pi - math.pi / 2) / 2 + 0.5
+
+ return torch.cat((fade, ones)).clamp_(0, 1)
+
+ def _fade_out(self, waveform_length: int, device: torch.device) -> Tensor:
+ fade = torch.linspace(0, 1, self.fade_out_len, device=device)
+ ones = torch.ones(waveform_length - self.fade_out_len, device=device)
+
+ if self.fade_shape == "linear":
+ fade = -fade + 1
+
+ if self.fade_shape == "exponential":
+ fade = torch.pow(2, -fade) * (1 - fade)
+
+ if self.fade_shape == "logarithmic":
+ fade = torch.log10(1.1 - fade) + 1
+
+ if self.fade_shape == "quarter_sine":
+ fade = torch.sin(fade * math.pi / 2 + math.pi / 2)
+
+ if self.fade_shape == "half_sine":
+ fade = torch.sin(fade * math.pi + math.pi / 2) / 2 + 0.5
+
+ return torch.cat((ones, fade)).clamp_(0, 1)
+
+
+class _AxisMasking(torch.nn.Module):
+ r"""Apply masking to a spectrogram.
+
+ Args:
+ mask_param (int): Maximum possible length of the mask.
+ axis (int): What dimension the mask is applied on (assuming the tensor is 3D).
+ For frequency masking, axis = 1.
+ For time masking, axis = 2.
+ iid_masks (bool): Applies iid masks to each of the examples in the batch dimension.
+ This option is applicable only when the dimension of the input tensor is >= 3.
+ p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
+ """
+ __constants__ = ["mask_param", "axis", "iid_masks", "p"]
+
+ def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None:
+ super(_AxisMasking, self).__init__()
+ self.mask_param = mask_param
+ self.axis = axis
+ self.iid_masks = iid_masks
+ self.p = p
+
+ def forward(self, specgram: Tensor, mask_value: float = 0.0) -> Tensor:
+ r"""
+ Args:
+ specgram (Tensor): Tensor of dimension `(..., freq, time)`.
+ mask_value (float): Value to assign to the masked columns.
+
+ Returns:
+ Tensor: Masked spectrogram of dimensions `(..., freq, time)`.
+ """
+ # If the iid_masks flag is set and specgram has a batch dimension,
+ # self.axis + specgram.dim() - 3 gives the time/frequency dimension (one of the last two dimensions)
+ # for input tensors whose dimension is not 3.
+ if self.iid_masks:
+ return F.mask_along_axis_iid(
+ specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p
+ )
+ else:
+ return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p)
+
+
+class FrequencyMasking(_AxisMasking):
+ r"""Apply masking to a spectrogram in the frequency domain.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Proposed in *SpecAugment* :cite:`specaugment`.
+
+ Args:
+ freq_mask_param (int): maximum possible length of the mask.
+ Indices uniformly sampled from [0, freq_mask_param).
+ iid_masks (bool, optional): whether to apply different masks to each
+ example/channel in the batch. (Default: ``False``)
+ This option is applicable only when the input tensor is >= 3D.
+
+ Example
+ >>> spectrogram = torchaudio.transforms.Spectrogram()
+ >>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
+ >>>
+ >>> original = spectrogram(waveform)
+ >>> masked = masking(original)
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png
+ :alt: The original spectrogram
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png
+ :alt: The spectrogram masked along frequency axis
+ """
+
+ def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None:
+ super(FrequencyMasking, self).__init__(freq_mask_param, 1, iid_masks)
+
+
+class TimeMasking(_AxisMasking):
+ r"""Apply masking to a spectrogram in the time domain.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Proposed in *SpecAugment* :cite:`specaugment`.
+
+ Args:
+ time_mask_param (int): maximum possible length of the mask.
+ Indices uniformly sampled from [0, time_mask_param).
+ iid_masks (bool, optional): whether to apply different masks to each
+ example/channel in the batch. (Default: ``False``)
+ This option is applicable only when the input tensor is >= 3D.
+ p (float, optional): maximum proportion of time steps that can be masked.
+ Must be within range [0.0, 1.0]. (Default: 1.0)
+
+ Example
+ >>> spectrogram = torchaudio.transforms.Spectrogram()
+ >>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
+ >>>
+ >>> original = spectrogram(waveform)
+ >>> masked = masking(original)
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png
+ :alt: The original spectrogram
+
+ .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png
+ :alt: The spectrogram masked along time axis
+ """
+
+ def __init__(self, time_mask_param: int, iid_masks: bool = False, p: float = 1.0) -> None:
+ if not 0.0 <= p <= 1.0:
+ raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).")
+ super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p)
+
+
+class SpecAugment(torch.nn.Module):
+ r"""Apply time and frequency masking to a spectrogram.
+ Args:
+ n_time_masks (int): Number of time masks. If its value is zero, no time masking will be applied.
+ time_mask_param (int): Maximum possible length of the time mask.
+ n_freq_masks (int): Number of frequency masks. If its value is zero, no frequency masking will be applied.
+ freq_mask_param (int): Maximum possible length of the frequency mask.
+ iid_masks (bool, optional): Applies iid masks to each of the examples in the batch dimension.
+ This option is applicable only when the input tensor is 4D. (Default: ``True``)
+ p (float, optional): maximum proportion of time steps that can be masked.
+ Must be within range [0.0, 1.0]. (Default: 1.0)
+ zero_masking (bool, optional): If ``True``, use 0 as the mask value,
+ else use mean of the input tensor. (Default: ``False``)
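+
+ Example
+ >>> # Illustrative parameter values; the spectrogram is computed as in the other examples.
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> specgram = torchaudio.transforms.Spectrogram()(waveform)
+ >>> specaug = torchaudio.transforms.SpecAugment(
+ >>> n_time_masks=2, time_mask_param=40, n_freq_masks=2, freq_mask_param=30
+ >>> )
+ >>> masked = specaug(specgram)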
+ """
+ __constants__ = [
+ "n_time_masks",
+ "time_mask_param",
+ "n_freq_masks",
+ "freq_mask_param",
+ "iid_masks",
+ "p",
+ "zero_masking",
+ ]
+
+ def __init__(
+ self,
+ n_time_masks: int,
+ time_mask_param: int,
+ n_freq_masks: int,
+ freq_mask_param: int,
+ iid_masks: bool = True,
+ p: float = 1.0,
+ zero_masking: bool = False,
+ ) -> None:
+ super(SpecAugment, self).__init__()
+ self.n_time_masks = n_time_masks
+ self.time_mask_param = time_mask_param
+ self.n_freq_masks = n_freq_masks
+ self.freq_mask_param = freq_mask_param
+ self.iid_masks = iid_masks
+ self.p = p
+ self.zero_masking = zero_masking
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""
+ Args:
+ specgram (Tensor): Tensor of shape `(..., freq, time)`.
+
+ Returns:
+ Tensor: Masked spectrogram of shape `(..., freq, time)`.
+ """
+ if self.zero_masking:
+ mask_value = 0.0
+ else:
+ mask_value = specgram.mean()
+ time_dim = specgram.dim() - 1
+ freq_dim = time_dim - 1
+
+ if specgram.dim() > 2 and self.iid_masks is True:
+ for _ in range(self.n_time_masks):
+ specgram = F.mask_along_axis_iid(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+ for _ in range(self.n_freq_masks):
+ specgram = F.mask_along_axis_iid(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+ else:
+ for _ in range(self.n_time_masks):
+ specgram = F.mask_along_axis(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+ for _ in range(self.n_freq_masks):
+ specgram = F.mask_along_axis(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+
+ return specgram
+
+
+class Loudness(torch.nn.Module):
+ r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: TorchScript
+
+ Args:
+ sample_rate (int): Sample rate of audio signal.
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.Loudness(sample_rate)
+ >>> loudness = transform(waveform)
+
+ Reference:
+ - https://www.itu.int/rec/R-REC-BS.1770-4-201510-I/en
+ """
+ __constants__ = ["sample_rate"]
+
+ def __init__(self, sample_rate: int):
+ super(Loudness, self).__init__()
+ self.sample_rate = sample_rate
+
+ def forward(self, waveform: Tensor):
+ r"""
+ Args:
+ waveform (torch.Tensor): audio waveform of dimension `(..., channels, time)`
+
+ Returns:
+ Tensor: loudness estimates (LKFS)
+ """
+ return F.loudness(waveform, self.sample_rate)
+
+
+class Vol(torch.nn.Module):
+ r"""Adjust volume of waveform.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ gain (float): Interpreted according to the given gain_type:
+ If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio.
+ If ``gain_type`` = ``power``, ``gain`` is a power (voltage squared).
+ If ``gain_type`` = ``db``, ``gain`` is in decibels.
+ gain_type (str, optional): Type of gain. One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.Vol(gain=0.5, gain_type="amplitude")
+ >>> quieter_waveform = transform(waveform)
+ """
+
+ def __init__(self, gain: float, gain_type: str = "amplitude"):
+ super(Vol, self).__init__()
+ self.gain = gain
+ self.gain_type = gain_type
+
+ if gain_type in ["amplitude", "power"] and gain < 0:
+ raise ValueError("If gain_type = amplitude or power, gain must be positive.")
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+ Returns:
+ Tensor: Tensor of audio of dimension `(..., time)`.
+ """
+ if self.gain_type == "amplitude":
+ waveform = waveform * self.gain
+
+ if self.gain_type == "db":
+ waveform = F.gain(waveform, self.gain)
+
+ if self.gain_type == "power":
+ waveform = F.gain(waveform, 10 * math.log10(self.gain))
+
+ return torch.clamp(waveform, -1, 1)
+
+
+class SlidingWindowCmn(torch.nn.Module):
+ r"""
+ Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
+ min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start).
+ Only applicable if center == false, ignored if center == true (int, default = 100)
+ center (bool, optional): If true, use a window centered on the current frame
+ (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
+ norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.SlidingWindowCmn(cmn_window=1000)
+ >>> cmn_waveform = transform(waveform)
+ """
+
+ def __init__(
+ self, cmn_window: int = 600, min_cmn_window: int = 100, center: bool = False, norm_vars: bool = False
+ ) -> None:
+ super().__init__()
+ self.cmn_window = cmn_window
+ self.min_cmn_window = min_cmn_window
+ self.center = center
+ self.norm_vars = norm_vars
+
+ def forward(self, specgram: Tensor) -> Tensor:
+ r"""
+ Args:
+ specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`.
+
+ Returns:
+ Tensor: Tensor of spectrogram of dimension `(..., time, freq)`.
+ """
+ cmn_specgram = F.sliding_window_cmn(specgram, self.cmn_window, self.min_cmn_window, self.center, self.norm_vars)
+ return cmn_specgram
+
+
+class Vad(torch.nn.Module):
+ r"""Voice Activity Detector. Similar to SoX implementation.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: TorchScript
+
+ Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
+ The algorithm currently uses a simple cepstral power measurement to detect voice,
+ so may be fooled by other things, especially music.
+
+ The effect can trim only from the front of the audio,
+ so in order to trim from the back, the reverse effect must also be used.
+
+ Args:
+ sample_rate (int): Sample rate of audio signal.
+ trigger_level (float, optional): The measurement level used to trigger activity detection.
+ This may need to be changed depending on the noise level, signal level,
+ and other characteristics of the input audio. (Default: 7.0)
+ trigger_time (float, optional): The time constant (in seconds)
+ used to help ignore short bursts of sound. (Default: 0.25)
+ search_time (float, optional): The amount of audio (in seconds)
+ to search for quieter/shorter bursts of audio to include prior
+ to the detected trigger point. (Default: 1.0)
+ allowed_gap (float, optional): The allowed gap (in seconds) between
+ quieter/shorter bursts of audio to include prior
+ to the detected trigger point. (Default: 0.25)
+ pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
+ before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
+ boot_time (float, optional): The algorithm (internally) uses adaptive noise
+ estimation/reduction in order to detect the start of the wanted audio.
+ This option sets the time for the initial noise estimate. (Default: 0.35)
+ noise_up_time (float, optional): Time constant used by the adaptive noise estimator
+ for when the noise level is increasing. (Default: 0.1)
+ noise_down_time (float, optional): Time constant used by the adaptive noise estimator
+ for when the noise level is decreasing. (Default: 0.01)
+ noise_reduction_amount (float, optional): Amount of noise reduction to use in
+ the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
+ measure_freq (float, optional): Frequency of the algorithm’s
+ processing/measurements. (Default: 20.0)
+ measure_duration (float or None, optional): Measurement duration.
+ (Default: Twice the measurement period; i.e. with overlap.)
+ measure_smooth_time (float, optional): Time constant used to smooth
+ spectral measurements. (Default: 0.4)
+ hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied
+ at the input to the detector algorithm. (Default: 50.0)
+ lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied
+ at the input to the detector algorithm. (Default: 6000.0)
+ hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used
+ in the detector algorithm. (Default: 150.0)
+ lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used
+ in the detector algorithm. (Default: 2000.0)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> waveform_reversed, sample_rate = apply_effects_tensor(waveform, sample_rate, [["reverse"]])
+ >>> transform = transforms.Vad(sample_rate=sample_rate, trigger_level=7.5)
+ >>> waveform_reversed_front_trim = transform(waveform_reversed)
+ >>> waveform_end_trim, sample_rate = apply_effects_tensor(
+ >>> waveform_reversed_front_trim, sample_rate, [["reverse"]]
+ >>> )
+
+ Reference:
+ - http://sox.sourceforge.net/sox.html
+ """
+
+ def __init__(
+ self,
+ sample_rate: int,
+ trigger_level: float = 7.0,
+ trigger_time: float = 0.25,
+ search_time: float = 1.0,
+ allowed_gap: float = 0.25,
+ pre_trigger_time: float = 0.0,
+ boot_time: float = 0.35,
+ noise_up_time: float = 0.1,
+ noise_down_time: float = 0.01,
+ noise_reduction_amount: float = 1.35,
+ measure_freq: float = 20.0,
+ measure_duration: Optional[float] = None,
+ measure_smooth_time: float = 0.4,
+ hp_filter_freq: float = 50.0,
+ lp_filter_freq: float = 6000.0,
+ hp_lifter_freq: float = 150.0,
+ lp_lifter_freq: float = 2000.0,
+ ) -> None:
+ super().__init__()
+
+ self.sample_rate = sample_rate
+ self.trigger_level = trigger_level
+ self.trigger_time = trigger_time
+ self.search_time = search_time
+ self.allowed_gap = allowed_gap
+ self.pre_trigger_time = pre_trigger_time
+ self.boot_time = boot_time
+ self.noise_up_time = noise_up_time
+ self.noise_down_time = noise_down_time
+ self.noise_reduction_amount = noise_reduction_amount
+ self.measure_freq = measure_freq
+ self.measure_duration = measure_duration
+ self.measure_smooth_time = measure_smooth_time
+ self.hp_filter_freq = hp_filter_freq
+ self.lp_filter_freq = lp_filter_freq
+ self.hp_lifter_freq = hp_lifter_freq
+ self.lp_lifter_freq = lp_lifter_freq
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`
+ Tensor of shape `(channels, time)` is treated as a multi-channel recording
+ of the same event and the resulting output will be trimmed to the earliest
+ voice activity in any channel.
+ """
+ return F.vad(
+ waveform=waveform,
+ sample_rate=self.sample_rate,
+ trigger_level=self.trigger_level,
+ trigger_time=self.trigger_time,
+ search_time=self.search_time,
+ allowed_gap=self.allowed_gap,
+ pre_trigger_time=self.pre_trigger_time,
+ boot_time=self.boot_time,
+ noise_up_time=self.noise_up_time,
+ noise_down_time=self.noise_down_time,
+ noise_reduction_amount=self.noise_reduction_amount,
+ measure_freq=self.measure_freq,
+ measure_duration=self.measure_duration,
+ measure_smooth_time=self.measure_smooth_time,
+ hp_filter_freq=self.hp_filter_freq,
+ lp_filter_freq=self.lp_filter_freq,
+ hp_lifter_freq=self.hp_lifter_freq,
+ lp_lifter_freq=self.lp_lifter_freq,
+ )
+
+
+class SpectralCentroid(torch.nn.Module):
+ r"""Compute the spectral centroid for each channel along the time axis.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ The spectral centroid is defined as the average of the frequency values,
+ weighted by their magnitudes.
+
+ Args:
+ sample_rate (int): Sample rate of audio signal.
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+ win_length (int or None, optional): Window size. (Default: ``n_fft``)
+ hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+ pad (int, optional): Two sided padding of signal. (Default: ``0``)
+ window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.SpectralCentroid(sample_rate)
+ >>> spectral_centroid = transform(waveform) # (channel, time)
+ """
+ __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad"]
+
+ def __init__(
+ self,
+ sample_rate: int,
+ n_fft: int = 400,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ pad: int = 0,
+ window_fn: Callable[..., Tensor] = torch.hann_window,
+ wkwargs: Optional[dict] = None,
+ ) -> None:
+ super(SpectralCentroid, self).__init__()
+ self.sample_rate = sample_rate
+ self.n_fft = n_fft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+ self.register_buffer("window", window)
+ self.pad = pad
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+ Returns:
+ Tensor: Spectral Centroid of size `(..., time)`.
+ """
+
+ return F.spectral_centroid(
+ waveform, self.sample_rate, self.pad, self.window, self.n_fft, self.hop_length, self.win_length
+ )
+
+
+class PitchShift(LazyModuleMixin, torch.nn.Module):
+ r"""Shift the pitch of a waveform by ``n_steps`` steps.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: TorchScript
+
+ Args:
+ sample_rate (int): Sample rate of the input waveform.
+ n_steps (int): The (fractional) steps to shift the waveform.
+ bins_per_octave (int, optional): The number of steps per octave (Default : ``12``).
+ n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``).
+ win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``).
+ hop_length (int or None, optional): Length of hop between STFT windows. If None, then ``win_length // 4``
+ is used (Default: ``None``).
+ window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+ that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+ wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> transform = transforms.PitchShift(sample_rate, 4)
+ >>> waveform_shift = transform(waveform) # (channel, time)
+ """
+ __constants__ = ["sample_rate", "n_steps", "bins_per_octave", "n_fft", "win_length", "hop_length"]
+
+ kernel: UninitializedParameter
+ width: int
+
+ def __init__(
+ self,
+ sample_rate: int,
+ n_steps: int,
+ bins_per_octave: int = 12,
+ n_fft: int = 512,
+ win_length: Optional[int] = None,
+ hop_length: Optional[int] = None,
+ window_fn: Callable[..., Tensor] = torch.hann_window,
+ wkwargs: Optional[dict] = None,
+ ) -> None:
+ super().__init__()
+ self.n_steps = n_steps
+ self.bins_per_octave = bins_per_octave
+ self.sample_rate = sample_rate
+ self.n_fft = n_fft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 4
+ window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+ self.register_buffer("window", window)
+ rate = 2.0 ** (-float(n_steps) / bins_per_octave)
+ self.orig_freq = int(sample_rate / rate)
+ self.gcd = math.gcd(int(self.orig_freq), int(sample_rate))
+
+ if self.orig_freq != sample_rate:
+ self.width = -1
+ self.kernel = UninitializedParameter(device=None, dtype=None)
+
+ def initialize_parameters(self, input):
+ if self.has_uninitialized_params():
+ if self.orig_freq != self.sample_rate:
+ with torch.no_grad():
+ kernel, self.width = _get_sinc_resample_kernel(
+ self.orig_freq,
+ self.sample_rate,
+ self.gcd,
+ dtype=input.dtype,
+ device=input.device,
+ )
+ self.kernel.materialize(kernel.shape)
+ self.kernel.copy_(kernel)
+
+ def forward(self, waveform: Tensor) -> Tensor:
+ r"""
+ Args:
+ waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+ Returns:
+ Tensor: The pitch-shifted audio of shape `(..., time)`.
+ """
+ shape = waveform.size()
+
+ waveform_stretch = _stretch_waveform(
+ waveform,
+ self.n_steps,
+ self.bins_per_octave,
+ self.n_fft,
+ self.win_length,
+ self.hop_length,
+ self.window,
+ )
+
+ if self.orig_freq != self.sample_rate:
+ waveform_shift = _apply_sinc_resample_kernel(
+ waveform_stretch,
+ self.orig_freq,
+ self.sample_rate,
+ self.gcd,
+ self.kernel,
+ self.width,
+ )
+ else:
+ waveform_shift = waveform_stretch
+
+ return _fix_waveform_shape(
+ waveform_shift,
+ shape,
+ )
+
+
+class RNNTLoss(torch.nn.Module):
+ """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+ :cite:`graves2012sequence`.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ The RNN Transducer loss extends the CTC loss by defining a distribution over output
+ sequences of all lengths, and by jointly modelling both input-output and output-output
+ dependencies.
+
+ Args:
+ blank (int, optional): blank label (Default: ``-1``)
+ clamp (float, optional): clamp for gradients (Default: ``-1``)
+ reduction (string, optional): Specifies the reduction to apply to the output:
+ ``"none"`` | ``"mean"`` | ``"sum"``. (Default: ``"mean"``)
+ fused_log_softmax (bool): set to False if calling log_softmax outside of loss (Default: ``True``)
+
+ Example
+ >>> # Hypothetical values
+ >>> logits = torch.tensor([[[[0.1, 0.6, 0.1, 0.1, 0.1],
+ >>> [0.1, 0.1, 0.6, 0.1, 0.1],
+ >>> [0.1, 0.1, 0.2, 0.8, 0.1]],
+ >>> [[0.1, 0.6, 0.1, 0.1, 0.1],
+ >>> [0.1, 0.1, 0.2, 0.1, 0.1],
+ >>> [0.7, 0.1, 0.2, 0.1, 0.1]]]],
+ >>> dtype=torch.float32,
+ >>> requires_grad=True)
+ >>> targets = torch.tensor([[1, 2]], dtype=torch.int)
+ >>> logit_lengths = torch.tensor([2], dtype=torch.int)
+ >>> target_lengths = torch.tensor([2], dtype=torch.int)
+ >>> transform = transforms.RNNTLoss(blank=0)
+ >>> loss = transform(logits, targets, logit_lengths, target_lengths)
+ >>> loss.backward()
+ """
+
+ def __init__(
+ self,
+ blank: int = -1,
+ clamp: float = -1.0,
+ reduction: str = "mean",
+ fused_log_softmax: bool = True,
+ ):
+ super().__init__()
+ self.blank = blank
+ self.clamp = clamp
+ self.reduction = reduction
+ self.fused_log_softmax = fused_log_softmax
+
+ def forward(
+ self,
+ logits: Tensor,
+ targets: Tensor,
+ logit_lengths: Tensor,
+ target_lengths: Tensor,
+ ):
+ """
+ Args:
+ logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)`
+ containing output from joiner
+ targets (Tensor): Tensor of dimension `(batch, max target length)` containing zero-padded targets
+ logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder
+ target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence
+ Returns:
+ Tensor: Loss with the reduction option applied. If ``reduction`` is ``"none"``, then size (batch),
+ otherwise scalar.
+ """
+ return F.rnnt_loss(
+ logits,
+ targets,
+ logit_lengths,
+ target_lengths,
+ self.blank,
+ self.clamp,
+ self.reduction,
+ self.fused_log_softmax,
+ )
+
+
+class Convolve(torch.nn.Module):
+ r"""
+ Convolves inputs along their last dimension using the direct method.
+ Note that, in contrast to :class:`torch.nn.Conv1d`, which actually applies the valid cross-correlation
+ operator, this module applies the true `convolution`_ operator.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ mode (str, optional): Must be one of ("full", "valid", "same").
+
+ * "full": Returns the full convolution result, with shape `(..., N + M - 1)`, where
+ `N` and `M` are the trailing dimensions of the two inputs. (Default)
+ * "valid": Returns the segment of the full convolution result corresponding to where
+ the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`.
+ * "same": Returns the center segment of the full convolution result, with shape `(..., N)`.
+
+ .. _convolution:
+ https://en.wikipedia.org/wiki/Convolution
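+
+ Example
+ >>> # Illustrative shapes; leading dimensions of the two operands must be broadcast-able.
+ >>> x = torch.randn(2, 16000)
+ >>> y = torch.randn(1, 128)
+ >>> convolve = torchaudio.transforms.Convolve(mode="same")
+ >>> z = convolve(x, y) # (2, 16000)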
+ """
+
+ def __init__(self, mode: str = "full") -> None:
+ _check_convolve_mode(mode)
+
+ super().__init__()
+ self.mode = mode
+
+ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ x (torch.Tensor): First convolution operand, with shape `(..., N)`.
+ y (torch.Tensor): Second convolution operand, with shape `(..., M)`
+ (leading dimensions must be broadcast-able with those of ``x``).
+
+ Returns:
+ torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where
+ the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
+ """
+ return F.convolve(x, y, mode=self.mode)
+
+
+class FFTConvolve(torch.nn.Module):
+ r"""
+ Convolves inputs along their last dimension using FFT. For inputs with large last dimensions, this module
+ is generally much faster than :class:`Convolve`.
+ Note that, in contrast to :class:`torch.nn.Conv1d`, which actually applies the valid cross-correlation
+ operator, this module applies the true `convolution`_ operator.
+ Also note that this module can only output float tensors (int tensor inputs will be cast to float).
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ mode (str, optional): Must be one of ("full", "valid", "same").
+
+ * "full": Returns the full convolution result, with shape `(..., N + M - 1)`, where
+ `N` and `M` are the trailing dimensions of the two inputs. (Default)
+ * "valid": Returns the segment of the full convolution result corresponding to where
+ the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`.
+ * "same": Returns the center segment of the full convolution result, with shape `(..., N)`.
+
+ .. _convolution:
+ https://en.wikipedia.org/wiki/Convolution
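+
+ Example
+ >>> # Illustrative shapes; "full" mode returns length N + M - 1 along the last dimension.
+ >>> x = torch.randn(2, 16000)
+ >>> y = torch.randn(1, 128)
+ >>> fftconvolve = torchaudio.transforms.FFTConvolve(mode="full")
+ >>> z = fftconvolve(x, y) # (2, 16127)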
+ """
+
+ def __init__(self, mode: str = "full") -> None:
+ _check_convolve_mode(mode)
+
+ super().__init__()
+ self.mode = mode
+
+ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ x (torch.Tensor): First convolution operand, with shape `(..., N)`.
+ y (torch.Tensor): Second convolution operand, with shape `(..., M)`
+ (leading dimensions must be broadcast-able with those of ``x``).
+
+ Returns:
+ torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where
+ the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
+ """
+ return F.fftconvolve(x, y, mode=self.mode)
+
+
+def _source_target_sample_rate(orig_freq: int, speed: float) -> Tuple[int, int]:
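+ # For example (illustrative): orig_freq=16000, speed=1.1 gives (17600, 16000);
+ # dividing by gcd=1600 yields (11, 10), i.e. resample from a relative rate of 11 down to 10.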
+ source_sample_rate = int(speed * orig_freq)
+ target_sample_rate = int(orig_freq)
+ gcd = math.gcd(source_sample_rate, target_sample_rate)
+ return source_sample_rate // gcd, target_sample_rate // gcd
+
+
+class Speed(torch.nn.Module):
+ r"""Adjusts waveform speed.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ orig_freq (int): Original frequency of the signals in ``waveform``.
+ factor (float): Factor by which to adjust speed of input. Values greater than 1.0
+ compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time.
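+
+ Example
+ >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> speed = torchaudio.transforms.Speed(orig_freq=sample_rate, factor=1.1) # illustrative factor: 10% faster
+ >>> sped_up_waveform, _ = speed(waveform)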
+ """
+
+ def __init__(self, orig_freq, factor) -> None:
+ super().__init__()
+
+ self.orig_freq = orig_freq
+ self.factor = factor
+
+ self.source_sample_rate, self.target_sample_rate = _source_target_sample_rate(orig_freq, factor)
+ self.resampler = Resample(orig_freq=self.source_sample_rate, new_freq=self.target_sample_rate)
+
+ def forward(self, waveform, lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ r"""
+ Args:
+ waveform (torch.Tensor): Input signals, with shape `(..., time)`.
+ lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform``, with shape `(...)`.
+ If ``None``, all elements in ``waveform`` are treated as valid. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor, torch.Tensor or None):
+ torch.Tensor
+ Speed-adjusted waveform, with shape `(..., new_time)`.
+ torch.Tensor or None
+ If ``lengths`` is not ``None``, valid lengths of signals in speed-adjusted waveform,
+ with shape `(...)`; otherwise, ``None``.
+ """
+
+ if lengths is None:
+ out_lengths = None
+ else:
+ out_lengths = torch.ceil(lengths * self.target_sample_rate / self.source_sample_rate).to(lengths.dtype)
+
+ return self.resampler(waveform), out_lengths
+
+
+class SpeedPerturbation(torch.nn.Module):
+ r"""Applies the speed perturbation augmentation introduced in
+ *Audio augmentation for speech recognition* :cite:`ko15_interspeech`. For a given input,
+ the module samples a speed-up factor from ``factors`` uniformly at random and adjusts
+ the speed of the input by that factor.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ orig_freq (int): Original frequency of the signals in ``waveform``.
+ factors (Sequence[float]): Factors by which to adjust speed of input. Values greater than 1.0
+ compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time.
+
+ Example
+ >>> speed_perturb = SpeedPerturbation(16000, [0.9, 1.1, 1.0, 1.0, 1.0])
+ >>> # waveform speed will be adjusted by factor 0.9 with 20% probability,
+ >>> # 1.1 with 20% probability, and 1.0 (i.e. kept the same) with 60% probability.
+ >>> speed_perturbed_waveform = speed_perturb(waveform, lengths)
+ """
+
+ def __init__(self, orig_freq: int, factors: Sequence[float]) -> None:
+ super().__init__()
+
+ self.speeders = torch.nn.ModuleList([Speed(orig_freq=orig_freq, factor=factor) for factor in factors])
+
+ def forward(
+ self, waveform: torch.Tensor, lengths: Optional[torch.Tensor] = None
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ r"""
+ Args:
+ waveform (torch.Tensor): Input signals, with shape `(..., time)`.
+ lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform``, with shape `(...)`.
+ If ``None``, all elements in ``waveform`` are treated as valid. (Default: ``None``)
+
+ Returns:
+ (torch.Tensor, torch.Tensor or None):
+ torch.Tensor
+ Speed-adjusted waveform, with shape `(..., new_time)`.
+ torch.Tensor or None
+ If ``lengths`` is not ``None``, valid lengths of signals in speed-adjusted waveform,
+ with shape `(...)`; otherwise, ``None``.
+ """
+
+ idx = int(torch.randint(len(self.speeders), ()))
+ # NOTE: we do this because TorchScript doesn't allow for
+ # indexing ModuleList instances with non-literals.
+ for speeder_idx, speeder in enumerate(self.speeders):
+ if idx == speeder_idx:
+ return speeder(waveform, lengths)
+ raise RuntimeError("Speeder not found; execution should have never reached here.")
+
+
+class AddNoise(torch.nn.Module):
+ r"""Scales and adds noise to waveform per signal-to-noise ratio.
+ See :meth:`torchaudio.functional.add_noise` for more details.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+ """
+
+ def forward(
+ self, waveform: torch.Tensor, noise: torch.Tensor, snr: torch.Tensor, lengths: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ r"""
+ Args:
+ waveform (torch.Tensor): Input waveform, with shape `(..., L)`.
+ noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``).
+ snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`.
+ lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform`` and ``noise``,
+ with shape `(...,)` (leading dimensions must match those of ``waveform``). If ``None``, all
+ elements in ``waveform`` and ``noise`` are treated as valid. (Default: ``None``)
+
+ Returns:
+ torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)`
+ (same shape as ``waveform``).
+ """
+ return F.add_noise(waveform, noise, snr, lengths)
+
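A minimal usage sketch of ``AddNoise``, assuming it is exported as ``torchaudio.transforms.AddNoise``; the SNR tensor carries one value in dB per leading-dimension element, as described in the docstring.

    import torch
    from torchaudio.transforms import AddNoise

    add_noise = AddNoise()
    waveform = torch.randn(2, 16000)         # (batch, time)
    noise = torch.randn(2, 16000)            # same shape as waveform
    snr = torch.tensor([10.0, 3.0])          # per-example SNR in dB
    noisy = add_noise(waveform, noise, snr)  # noise is rescaled so each example hits its SNR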
+
+class Preemphasis(torch.nn.Module):
+ r"""Pre-emphasizes a waveform along its last dimension.
+ See :meth:`torchaudio.functional.preemphasis` for more details.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ coeff (float, optional): Pre-emphasis coefficient. Typically between 0.0 and 1.0.
+ (Default: 0.97)
+ """
+
+ def __init__(self, coeff: float = 0.97) -> None:
+ super().__init__()
+ self.coeff = coeff
+
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ waveform (torch.Tensor): Waveform, with shape `(..., N)`.
+
+ Returns:
+ torch.Tensor: Pre-emphasized waveform, with shape `(..., N)`.
+ """
+ return F.preemphasis(waveform, coeff=self.coeff)
+
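For intuition, a hand-rolled sketch of the filter that ``F.preemphasis`` applies, assuming the usual definition ``y[n] = x[n] - coeff * x[n - 1]`` with ``y[0] = x[0]`` (the exact boundary handling is an assumption here, not read off this diff).

    import torch

    x = torch.tensor([1.0, 2.0, 3.0, 4.0])
    coeff = 0.97
    y = x.clone()
    y[1:] = x[1:] - coeff * x[:-1]  # -> tensor([1.0000, 1.0300, 1.0600, 1.0900])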
+
+class Deemphasis(torch.nn.Module):
+ r"""De-emphasizes a waveform along its last dimension.
+ See :meth:`torchaudio.functional.deemphasis` for more details.
+
+ .. devices:: CPU CUDA
+
+ .. properties:: Autograd TorchScript
+
+ Args:
+ coeff (float, optional): De-emphasis coefficient. Typically between 0.0 and 1.0.
+ (Default: 0.97)
+ """
+
+ def __init__(self, coeff: float = 0.97) -> None:
+ super().__init__()
+ self.coeff = coeff
+
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+ r"""
+ Args:
+ waveform (torch.Tensor): Waveform, with shape `(..., N)`.
+
+ Returns:
+ torch.Tensor: De-emphasized waveform, with shape `(..., N)`.
+ """
+ return F.deemphasis(waveform, coeff=self.coeff)
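Assuming ``F.deemphasis`` implements the inverse recursion ``y[n] = x[n] + coeff * y[n - 1]``, the two modules above should round-trip a signal up to numerical error; a small sketch:

    import torch
    from torchaudio.transforms import Deemphasis, Preemphasis

    x = torch.randn(1, 1000)
    pre, de = Preemphasis(coeff=0.97), Deemphasis(coeff=0.97)
    roundtrip = de(pre(x))
    print(torch.allclose(roundtrip, x, atol=1e-4))  # expected: True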
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/__init__.py b/MLPY/Lib/site-packages/torchaudio/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aff17a50e2fb49176c884e5d4b11970ae76cbab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/utils/__init__.py
@@ -0,0 +1,11 @@
+from torio.utils import ffmpeg_utils
+
+from . import sox_utils
+from .download import download_asset
+
+
+__all__ = [
+ "download_asset",
+ "sox_utils",
+ "ffmpeg_utils",
+]
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96329bad617c1280eaa84e2c0afec84e098d6408
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/download.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/download.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbcc7d931832e3c0eb473946d2768b3d1bae427d
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/download.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/ffmpeg_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/ffmpeg_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70834785c266eb66e74c74142cdfd4c6826f96ae
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/ffmpeg_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/sox_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/sox_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ce5766281b495cf2d54492313186023a7bf72fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torchaudio/utils/__pycache__/sox_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/download.py b/MLPY/Lib/site-packages/torchaudio/utils/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f62c7ef1f56ba0ee25888e5fc14dcb2c665ba6a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/utils/download.py
@@ -0,0 +1,89 @@
+import hashlib
+import logging
+from os import PathLike
+from pathlib import Path
+from typing import Union
+
+import torch
+from torchaudio._internal import download_url_to_file
+
+_LG = logging.getLogger(__name__)
+
+
+def _get_local_path(key):
+ path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
+ path.parent.mkdir(parents=True, exist_ok=True)
+ return path
+
+
+def _download(key, path, progress):
+ url = f"https://download.pytorch.org/torchaudio/{key}"
+ download_url_to_file(url, path, progress=progress)
+
+
+def _get_hash(path, hash, chunk_size=1028):
+ m = hashlib.sha256()
+ with open(path, "rb") as file:
+ data = file.read(chunk_size)
+ while data:
+ m.update(data)
+ data = file.read(chunk_size)
+ return m.hexdigest()
+
+
+def download_asset(
+ key: str,
+ hash: str = "",
+ path: Union[str, PathLike] = "",
+ *,
+ progress: bool = True,
+) -> str:
+ """Download and store torchaudio assets to local file system.
+
+ If a file already exists at the download path, the download is skipped and that path
+ is returned; hash validation is still performed when ``hash`` is provided.
+
+ Args:
+ key (str): The asset identifier.
+ hash (str, optional):
+ The value of SHA256 hash of the asset. If provided, it is used to verify
+ the downloaded / cached object. If not provided, then no hash validation
+ is performed. This means if a file exists at the download path, then the path
+ is returned as-is without verifying the identity of the file.
+ path (path-like object, optional):
+ By default, the downloaded asset is saved in a directory under
+ :py:func:`torch.hub.get_dir` and intermediate directories based on the given `key`
+ are created.
+ This argument can be used to overwrite the target location.
+ When this argument is provided, all the intermediate directories have to be
+ created beforehand.
+ progress (bool): Whether to show progress bar for downloading. Default: ``True``.
+
+ Note:
+ Currently, the valid key values are the routes on ``download.pytorch.org/torchaudio``,
+ but this is an implementation detail.
+
+ Returns:
+ str: The path to the asset on the local file system.
+ """
+ path = path or _get_local_path(key)
+
+ if path.exists():
+ _LG.info("The local file (%s) exists. Skipping the download.", path)
+ else:
+ _LG.info("Downloading %s to %s", key, path)
+ _download(key, path, progress=progress)
+
+ if hash:
+ _LG.info("Verifying the hash value.")
+ digest = _get_hash(path, hash)
+
+ if digest != hash:
+ raise ValueError(
+ f"The hash value of the downloaded file ({path}), '{digest}' does not match "
+ f"the provided hash value, '{hash}'."
+ )
+
+ _LG.info("Hash validated.")
+
+ return str(path)
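A usage sketch of ``download_asset``; the key below is one used by the torchaudio tutorials and is assumed to still be hosted under ``download.pytorch.org/torchaudio``.

    from torchaudio.utils import download_asset

    path = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
    print(path)  # a cached file under <torch.hub.get_dir()>/torchaudio/tutorial-assets/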
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/ffmpeg_utils.py b/MLPY/Lib/site-packages/torchaudio/utils/ffmpeg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a4bb3c4e3b621ff7b48062d1c4d3374a4459c90
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/utils/ffmpeg_utils.py
@@ -0,0 +1,11 @@
+"""Module to change the configuration of FFmpeg libraries (such as libavformat).
+
+It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`torchaudio.load`).
+"""
+
+
+# This file is just for BC.
+def __getattr__(item):
+ from torio.utils import ffmpeg_utils
+
+ return getattr(ffmpeg_utils, item)
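The shim above relies on PEP 562 module-level ``__getattr__``: every attribute lookup is forwarded to ``torio.utils.ffmpeg_utils`` at access time. A small check, assuming ``get_versions`` exists on the torio side as in current releases:

    from torchaudio.utils import ffmpeg_utils
    from torio.utils import ffmpeg_utils as torio_ffmpeg_utils

    # both names resolve to the very same function object
    assert ffmpeg_utils.get_versions is torio_ffmpeg_utils.get_versions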
diff --git a/MLPY/Lib/site-packages/torchaudio/utils/sox_utils.py b/MLPY/Lib/site-packages/torchaudio/utils/sox_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8975b4216f54e3ece63483bf91b49f10385f5785
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/utils/sox_utils.py
@@ -0,0 +1,99 @@
+"""Module to change the configuration of libsox, which is used by I/O functions like
+:py:mod:`~torchaudio.backend.sox_io_backend` and :py:mod:`~torchaudio.sox_effects`.
+"""
+
+from typing import Dict, List
+
+import torchaudio
+
+sox_ext = torchaudio._extension.lazy_import_sox_ext()
+
+
+def set_seed(seed: int):
+ """Set libsox's PRNG
+
+ Args:
+ seed (int): Seed value. Valid range is int32.
+
+ See Also:
+ http://sox.sourceforge.net/sox.html
+ """
+ sox_ext.set_seed(seed)
+
+
+def set_verbosity(verbosity: int):
+ """Set libsox's verbosity
+
+ Args:
+ verbosity (int): Set verbosity level of libsox.
+
+ * ``1`` failure messages
+ * ``2`` warnings
+ * ``3`` details of processing
+ * ``4``-``6`` increasing levels of debug messages
+
+ See Also:
+ http://sox.sourceforge.net/sox.html
+ """
+ sox_ext.set_verbosity(verbosity)
+
+
+def set_buffer_size(buffer_size: int):
+ """Set buffer size for sox effect chain
+
+ Args:
+ buffer_size (int): Set the size in bytes of the buffers used for processing audio.
+
+ See Also:
+ http://sox.sourceforge.net/sox.html
+ """
+ sox_ext.set_buffer_size(buffer_size)
+
+
+def set_use_threads(use_threads: bool):
+ """Set multithread option for sox effect chain
+
+ Args:
+ use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
+ To use multithreading, the underlying ``libsox`` has to be compiled with OpenMP support.
+
+ See Also:
+ http://sox.sourceforge.net/sox.html
+ """
+ sox_ext.set_use_threads(use_threads)
+
+
+def list_effects() -> Dict[str, str]:
+ """List the available sox effect names
+
+ Returns:
+ Dict[str, str]: Mapping from ``effect name`` to ``usage``
+ """
+ return dict(sox_ext.list_effects())
+
+
+def list_read_formats() -> List[str]:
+ """List the supported audio formats for read
+
+ Returns:
+ List[str]: List of supported audio formats
+ """
+ return sox_ext.list_read_formats()
+
+
+def list_write_formats() -> List[str]:
+ """List the supported audio formats for write
+
+ Returns:
+ List[str]: List of supported audio formats
+ """
+ return sox_ext.list_write_formats()
+
+
+def get_buffer_size() -> int:
+ """Get buffer size for sox effect chain
+
+ Returns:
+ int: Size in bytes of the buffers used for processing audio.
+ """
+ return sox_ext.get_buffer_size()
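A usage sketch of the helpers above; this assumes a torchaudio build in which the sox extension is available (it is typically not shipped on Windows).

    from torchaudio.utils import sox_utils

    sox_utils.set_verbosity(1)                   # failure messages only
    print(sox_utils.get_buffer_size())           # current buffer size in bytes
    print(sorted(sox_utils.list_effects())[:5])  # a few of the available effect names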
diff --git a/MLPY/Lib/site-packages/torchaudio/version.py b/MLPY/Lib/site-packages/torchaudio/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..986b1484a35bfb5827cb008f95365b37742d5b01
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchaudio/version.py
@@ -0,0 +1,2 @@
+__version__ = '2.3.1+cpu'
+git_version = '3edcf69e78a3c9a3077a11159861422440ec7d4a'
diff --git a/MLPY/Lib/site-packages/torchgen/__init__.py b/MLPY/Lib/site-packages/torchgen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b61af2c4b58ff14ab7e3b24bf22e8ec6a95da0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/__init__.py
@@ -0,0 +1,10 @@
+"""torchgen
+
+This module contains codegeneration utilities for PyTorch. It is used to
+build PyTorch from source, but may also be used for out-of-tree projects
+that extend PyTorch.
+
+Note well that we provide no BC guarantees for torchgen. If you're interested
+in using torchgen and want the PyTorch team to be aware, please reach out
+on GitHub.
+"""
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f82e03da7a1bbcbd6d63d059c3eee7a5f7a15f3
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/code_template.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/code_template.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bae7cf86389ef923393e40238a8b319b57782f8
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/code_template.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/context.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/context.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca68294e00b4e9596b0326f739ef247138b4da35
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/context.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b35b2c4d863d0a8eaaad56c19df4d890a73e1c58
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen_aoti_c_shim.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_aoti_c_shim.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79714eccfd7dd56ef5541fd77992c97383f71923
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_aoti_c_shim.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen_backend_stubs.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_backend_stubs.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9135cfa518f47e2ec2ca17a880d0bf35def33a59
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_backend_stubs.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen_executorch.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_executorch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3db6e22165c607a059580ce12d0f32867d96b3f8
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_executorch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen_functionalization_type.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_functionalization_type.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f828ff7edf70e3dc964624f8538f630efc6e052e
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_functionalization_type.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen_lazy_tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_lazy_tensor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..789762bc114d2ab9481070536369870d0fbf1167
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_lazy_tensor.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/gen_vmap_plumbing.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_vmap_plumbing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd62533b71869a1539302814d8d63e36f3572a58
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/gen_vmap_plumbing.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/local.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/local.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd6fee321c21af0277b1e205294c9a27f5d398fe
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/local.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/model.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/model.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04561eb0a4cdc41f4ac86738f7f384cf7a221de9
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/model.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/native_function_generation.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/native_function_generation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d0d7af416babb98d9d96b66a4df9e3d6834a230
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/native_function_generation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d01da46b5bc71625cc115f3a353042efa3f153d8
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/__pycache__/yaml_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/__pycache__/yaml_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b5d843276b84b07e1367660c12ee292fe05f2e1
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/__pycache__/yaml_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__init__.py b/MLPY/Lib/site-packages/torchgen/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f717f5c6679af213a9cbc53097dbcf3eb16d125
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/autograd.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/autograd.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec72dc2d4644cfdf082407fba2cd49257d71797a
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/autograd.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/cpp.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/cpp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee95402324f6540e9196b2d7e11f62ef6080a04f
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/cpp.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/dispatcher.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/dispatcher.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b870cc7919cf983af356ccebaa3389dbbc0d6a0
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/dispatcher.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/functionalization.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/functionalization.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6df5d09a406a3b746a33eb4b6d8f866ad2adf6c
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/functionalization.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/lazy.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/lazy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a4377be4af138d7df7e729a92e04eeaf9295691
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/lazy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/meta.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/meta.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45724da7a063750d30619224af6298acc14c9739
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/meta.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/native.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/native.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5217ea5b59ca98416ed706f4e5a2fee2babb3a53
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/native.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/python.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/python.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9d874250872dfe19afd2e0228783e408a6ec357
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/python.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/structured.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/structured.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..850b3aa893efe116e9c7f1b1e8dc0ecc50139489
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/structured.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/translate.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/translate.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3adbd825adf8a3cf1e40802ee3451cac3bf54aa3
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/translate.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/ufunc.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/ufunc.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9120ae85398231d1f44d46af89746c4813590d34
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/ufunc.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/__pycache__/unboxing.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/__pycache__/unboxing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c36d391bf48a4153b70313e757f7028a0f7fb1ae
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/__pycache__/unboxing.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/autograd.py b/MLPY/Lib/site-packages/torchgen/api/autograd.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ac5011c90a765261c917e94ddb20ca443536f17
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/autograd.py
@@ -0,0 +1,853 @@
+import re
+from dataclasses import dataclass
+from typing import cast, Dict, List, Match, Optional, Sequence, Set, Tuple
+
+from torchgen import local
+
+from torchgen.api import cpp
+from torchgen.api.types import BaseCType, Binding, NamedCType, tensorListT
+from torchgen.model import (
+ BaseTy,
+ BaseType,
+ FunctionSchema,
+ ListType,
+ NativeFunction,
+ NativeFunctionsViewGroup,
+ SchemaKind,
+ Type,
+)
+from torchgen.utils import IDENT_REGEX
+
+
+# Represents a saved attribute involved in backward calculation.
+# Note that it can be a derived property of an input argument, e.g.:
+# we could save `other.scalar_type()` instead of the entire `other` tensor.
+@dataclass(frozen=True)
+class SavedAttribute:
+ # The NamedCType holds the updated name and cpp type of the attribute.
+ # For the name, a suffix is appended if it's a derived property, e.g.: `other_scalar_type`.
+ nctype: NamedCType
+
+ # The expression to read the derived property at save time, e.g.:
+ # `other.scalar_type()`.
+ expr: str
+
+
+# Represents a backward formula that calculates derivatives for one
+# or more tensors.
+@dataclass(frozen=True)
+class Derivative:
+ # The formula string (legit C++ expression).
+ # Note that expressions against input arguments have been replaced with the
+ # corresponding saved attributes.
+ # E.g.:
+ # raw formula: `mul_tensor_backward(grad, self, other.scalar_type())`
+ # here: `mul_tensor_backward(grad, self, other_scalar_type)`
+ formula: str
+
+ # The formula string before input argument replacement
+ original_formula: str
+
+ # Names of the arguments for which this formula calculates derivatives.
+ var_names: Tuple[str, ...]
+
+ # Saved inputs that are referenced by the formula.
+ saved_inputs: Tuple[SavedAttribute, ...]
+
+ # Saved outputs that are referenced by the formula.
+ saved_outputs: Tuple[SavedAttribute, ...]
+
+ # Gradients that are referenced by name in the formula.
+ named_gradients: Set[str]
+
+
+# Represents a forward formula that calculates forward derivatives
+# for one tensor.
+@dataclass(frozen=True)
+class ForwardDerivative:
+ # The formula string (legit C++ expression).
+ # Note that special keywords such as "linear" or "element_wise" have been
+ # replaced by the automatically generated formula.
+ formula: str
+
+ # Name of the output arguments for which this formula calculates forward
+ # derivatives
+ var_names: Tuple[str, ...]
+
+ # Type of the output arguments for which this formula calculates forward
+ # derivatives
+ var_types: Tuple[Type, ...]
+
+ # Inputs for which the forward derivatives are required for this formula
+ required_inputs_fw_grad: Optional[Tuple[str, ...]]
+
+ # Inputs for which the primal is required for this formula
+ required_inputs_primal: Optional[Tuple[str, ...]]
+
+ # Flag to specify if this formula requires the original value of self
+ # This is only used by inplace operations
+ required_original_self_value: bool
+
+ # If this formula is specified in derivatives.yaml or if we are re-using the
+ # out of place formula for inplace
+ is_reusing_outplace_formula: bool
+
+
+# Represents differentiability info for a NativeFunction.
+@dataclass(frozen=True)
+class DifferentiabilityInfo:
+ # The base name read from derivatives.yaml.
+ name: str
+
+ # The matching native function.
+ #
+ # There can be multiple NativeFunction having the same base name:
+ # - different overloads with different types of input arguments;
+ # - in-place/out/functional variants of the same function;
+ #
+ # We first use the schema string (under the 'name' key) in derivatives.yaml
+ # to find the NativeFunction having the same schema string.
+ # Then we find the in-place/out/functional variants of the matching function.
+ # Among these variants, we choose the one having the same name as the
+ # derivatives.yaml entry. If there is no exact match, then we choose the
+ # in-place variant.
+ # TODO: maybe the logic to search for all variants is no longer necessary?
+ func: NativeFunction
+
+ # The name of the generated autograd function.
+ # It's set only if we will calculate a derivative, i.e.
+ # 'args_with_derivatives' is not empty.
+ op: Optional[str]
+
+ # The derivatives formulae for this function.
+ # Note that the length of this sequence is the number of differentiable inputs
+ derivatives: Sequence[Derivative]
+
+ # The forward derivatives formulae for this function.
+ # Note that the length of this sequence is the number of differentiable outputs
+ forward_derivatives: Sequence[ForwardDerivative]
+
+ # The union of 'saved_inputs' of all 'derivatives'.
+ all_saved_inputs: Sequence[SavedAttribute]
+
+ # The union of 'saved_outputs' of all 'derivatives'.
+ all_saved_outputs: Sequence[SavedAttribute]
+
+ # All named gradients that are available for use, in the same
+ # order as in the grads vector.
+ available_named_gradients: Sequence[str]
+
+ # The named gradients that are used in any of the derivatives.
+ # Invariant: all(name in available_named_gradients for name in used_named_gradients)
+ used_named_gradients: Set[str]
+
+ # The function's input arguments for which it calculates derivatives.
+ # It's the union of 'var_names' of all 'derivatives', sorted by the
+ # argument order in the function schema.
+ args_with_derivatives: Sequence[Binding]
+
+ # Names of arguments whose derivative formula is 'non_differentiable'.
+ non_differentiable_arg_names: Sequence[str]
+
+ # Raw data read from derivatives.yaml.
+ output_differentiability: Optional[List[bool]]
+
+ # output_differentiability in derivatives.yaml can be a list of
+ # conditions that express if the output is differentiable. In this case,
+ # the number of conditions must match the number of outputs
+ # (NB: we only support one condition right now).
+ # output_differentiability gets populated with True for each condition,
+ # while output_differentiability_conditions gets populated with the conditions
+ output_differentiability_conditions: Optional[List[str]]
+
+ @property
+ def has_derivatives(self) -> bool:
+ return len(self.args_with_derivatives) > 0
+
+ # Generates a new DifferentiabilityInfo using the exact same set of derivative information,
+ # but with a new operator name.
+ # This is used when generating "copy" variants of view ops,
+ # which are able to use the exact same derivative formula as the original view op
+ # See Note [Codegen'd {view}_copy Operators]
+ def create_view_copy_from_view_derivative(
+ self, g: NativeFunctionsViewGroup
+ ) -> Optional["DifferentiabilityInfo"]:
+ if g.view_copy is None:
+ return None
+ f = g.view_copy
+
+ name_split_by_period = self.name.split(".", maxsplit=2)
+ # Append a "_copy" to the base name of the operator (but keep the overload name the same)
+ view_copy_name = f"{name_split_by_period[0]}_copy." + ".".join(
+ name_split_by_period[1:]
+ )
+ view_copy_op_name = None if self.op is None else f"{self.op}_copy"
+
+ return DifferentiabilityInfo(
+ # Use the "_copy" version of name/func/op
+ name=view_copy_name,
+ func=f,
+ op=view_copy_op_name,
+ # But keep all derivative info the same
+ derivatives=self.derivatives,
+ forward_derivatives=self.forward_derivatives,
+ all_saved_inputs=self.all_saved_inputs,
+ all_saved_outputs=self.all_saved_outputs,
+ available_named_gradients=self.available_named_gradients,
+ used_named_gradients=self.used_named_gradients,
+ args_with_derivatives=self.args_with_derivatives,
+ non_differentiable_arg_names=self.non_differentiable_arg_names,
+ output_differentiability=self.output_differentiability,
+ output_differentiability_conditions=self.output_differentiability_conditions,
+ )
+
+
+def uses_ident(info: Optional[DifferentiabilityInfo], ident: str) -> bool:
+ if info is None:
+ return False
+ for derivative in info.derivatives:
+ formula = derivative.formula
+ if re.search(IDENT_REGEX.format(ident), formula):
+ return True
+ return False
+
+
+def uses_retain_variables(info: Optional[DifferentiabilityInfo]) -> bool:
+ return uses_ident(info, "retain_variables")
+
+
+def uses_single_grad(info: Optional[DifferentiabilityInfo]) -> bool:
+ return uses_ident(info, "grad")
+
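The helpers above do whole-word matching of identifiers inside derivative formulas. A self-contained illustration, where ``IDENT_REGEX`` mirrors ``torchgen.utils.IDENT_REGEX`` as understood here (treat the exact pattern as an assumption):

    import re

    IDENT_REGEX = r"(^|\W){}($|\W)"  # assumed to match torchgen.utils.IDENT_REGEX

    formula = "mul_tensor_backward(grad, self, other_scalar_type)"
    print(bool(re.search(IDENT_REGEX.format("grad"), formula)))   # True: "grad" occurs as a word
    print(bool(re.search(IDENT_REGEX.format("grads"), formula)))  # False: "grads" never occurs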
+
+# Represents a differentiable `Argument`.
+# How is it different from the `Argument` type?
+# - It's processed Arguments which are differentiable and only used in the
+# context of the autograd codegen;
+# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument;
+@dataclass(frozen=True)
+class DifferentiableInput:
+ name: str
+ type: Type
+
+ # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove.
+ cpp_type: str
+
+
+# Represents a differentiable `Return`.
+# How is it different from the `Return` type?
+# - The name in `Return` is optional. Here it is always populated using the same
+# `cpp.return_names()` method.
+# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant?
+# - It's processed Returns which are differentiable, in compliance with the
+# `output_differentiability` field defined in derivatives.yaml (if specified),
+# and are only used in the context of the autograd codegen;
+@dataclass(frozen=True)
+class DifferentiableOutput:
+ name: str
+ type: Type
+
+ # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove.
+ cpp_type: str
+
+
+@dataclass(frozen=True)
+class NativeFunctionWithDifferentiabilityInfo:
+ func: NativeFunction
+ info: Optional[Dict[str, DifferentiabilityInfo]]
+ fw_derivatives: Optional[Dict[str, Sequence[ForwardDerivative]]]
+
+
+# TODO: Update comment below since it is out of date.
+def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str:
+ """How are we going to call the underlying implementation of a
+ declaration? There are two strategies:
+ - use_derived: we want to call the implementation on CPUDoubleType
+ (or a similar, derived Type instance). Because these derived
+ instances deal in Tensors, not Variables (it's a completely different
+ object, so it doesn't dispatch back to VariableType), code on
+ this dispatch path needs to wrap/unwrap tensors. If the
+ derived implementation takes and returns tensors, the
+ implementation is usually differentiable (although we also use
+ the derived dispatch path for non-differentiable functions
+ that we still want to dispatch on the derived Type instance;
+ e.g., size())
+ - use_type: we want to call the implementation on Type, because
+ it is implemented concretely, and the functions it invokes will
+ get dispatched back to VariableType (which will ensure that they
+ are differentiable.)
+ """
+ # fn is derived as long as any of its per-key differentiability infos
+ # has_derivatives. dispatch_strategy() is used to guard generation of fns in VariableType
+ # and ADInplaceOrViewType. We want to generate these functions as long as a
+ # derivative is defined for ANY dispatch key.
+ if fn.func.is_abstract or (
+ fn.info is not None and any(info.has_derivatives for info in fn.info.values())
+ ):
+ # If the function is abstract (not implemented on at::Type), we must
+ # call the implementation on the derived type with unpacked tensors.
+
+ # If the function has a derivative specified and is concrete, we could
+ # call either implementation. We prefer calling the derived
+ # type's implementation with unpacked tensors because it is more
+ # performant in some cases: any internal calls to other ATen functions
+ # won't have the history tracked.
+
+ # If the function has a type dispatched argument (i.e. is a factory),
+ # we prefer calling the derived type's implementation both because it is
+ # more performant and to ensure factory functions return tensors with _version
+ # of 0 (probably not strictly necessary, but nice to have to keeps versions simple
+ # to understand.
+
+ return "use_derived"
+ else:
+ # If the function is concrete (we don't have to override it) and we
+ # didn't declare it in derivatives.yaml, we'll assume that it is
+ # actually implemented out of differentiable functions. (This
+ # assumption might not hold, but then you'll see gradcheck fail.)
+ return "use_type"
+
+
+def is_foreach_func(f: NativeFunction) -> bool:
+ return f.func.name.name.base.startswith("_foreach_")
+
+
+# note(crcrpar): Most foreach functions can reference an out-place `torch` function whose schema kind
+# is functional for their backward derivatives (and forward derivatives in the future), i.e.,
+# such a reference can be found in `functional_info_by_signature`. There are, however, some exceptions:
+_foreach_with_inplace_ref = {"_foreach_zero_"}
+_foreach_with_tensor_overload = {
+ "_foreach_add.Tensor",
+ "_foreach_mul.Tensor",
+ "_foreach_div.Tensor",
+}
+
+
+# Checks if `function_schema` is a native, non-foreach function that `f`, a foreach function,
+# references to generate derivatives.
+def is_reference_for_foreach(
+ f: NativeFunction,
+ function_schema: FunctionSchema,
+) -> bool:
+ return (
+ f.func.name.name.base.split("_foreach_")[-1] == function_schema.name.name.base
+ and (
+ not function_schema.name.name.inplace
+ or str(f.func.name) in _foreach_with_inplace_ref
+ )
+ and all(
+ ref_arg.type in (arg.type, getattr(arg.type, "elem", None))
+ for arg, ref_arg in zip(
+ f.func.arguments.flat_non_out,
+ function_schema.arguments.flat_non_out,
+ )
+ )
+ )
+
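A tiny worked check of the naming convention used above (string level only; the real check also compares in-placeness and argument types):

    # a foreach op such as "_foreach_add" references the plain "add" schema
    assert "_foreach_add".split("_foreach_")[-1] == "add"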
+
+# TODO(crcrpar): Avoid hard coding "Default" ideally.
+def gen_foreach_derivativeinfo(
+ foreach_function: NativeFunction,
+ functional_info_by_signature: Dict[
+ FunctionSchema, Dict[str, DifferentiabilityInfo]
+ ],
+ non_functional_info_by_signature: Dict[
+ FunctionSchema, Dict[str, DifferentiabilityInfo]
+ ],
+ dispatch_key: str = "Default",
+) -> Tuple[Optional[DifferentiabilityInfo], bool]:
+ """Generate DifferentiabilityInfo for out-place foreach function, return the existing one for in-place.
+
+ The second return value indicates whether the info is generated in this function.
+ """
+ ref_diff_info: Optional[DifferentiabilityInfo] = None
+
+ for function_schema, diff_info in functional_info_by_signature.items():
+ if not is_reference_for_foreach(foreach_function, function_schema):
+ continue
+ ref_diff_info = diff_info[dispatch_key]
+ if ref_diff_info is not None:
+ break
+ # note(crcrpar): It seems like `zero`'s info isn't available in functional_info_by_signature
+ # while the info of `zero_` is in non_functional_info_by_signature
+ if (
+ ref_diff_info is None
+ and foreach_function.func.kind() == SchemaKind.inplace
+ and str(foreach_function.func.name) in _foreach_with_inplace_ref
+ ):
+ for function_schema, diff_info in non_functional_info_by_signature.items():
+ if not is_reference_for_foreach(foreach_function, function_schema):
+ continue
+ ref_diff_info = diff_info[dispatch_key]
+ if ref_diff_info is not None:
+ break
+ if ref_diff_info is None:
+ return None, False
+
+ # The in-place (non out-of-place) variant reuses the existing Derivative as-is.
+ if foreach_function.func.kind() == SchemaKind.inplace:
+ return ref_diff_info, False
+
+ map_refarg2foreacharg, map_name2arg = {}, {}
+ for i, (arg, ref_arg) in enumerate(
+ zip(
+ foreach_function.func.arguments.flat_non_out,
+ function_schema.arguments.flat_non_out,
+ )
+ ):
+ map_refarg2foreacharg[ref_arg.name] = arg.name
+ map_name2arg[arg.name] = arg
+
+ all_saved_inputs, all_saved_outputs, all_var_names = [], [], []
+ modified_derivative_formulas = []
+ for i, derivative in enumerate(ref_diff_info.derivatives):
+ modified_formula = derivative.formula.replace("grad", "grads[i]").replace(
+ "result", "result[i]"
+ )
+ saved_inputs, saved_outputs = [], []
+ # note(crcrpar): This context seems necessary to call `cpp.argument_type`
+ with local.parametrize(
+ use_const_ref_for_mutable_tensors=foreach_function.use_const_ref_for_mutable_tensors,
+ use_ilistref_for_tensor_lists=foreach_function.part_of_structured_group,
+ ):
+ for ref_input in derivative.saved_inputs:
+ ref_input_jit_name = ref_input.expr.split(".")[0]
+ mapped_name = map_refarg2foreacharg[ref_input_jit_name]
+ if isinstance(map_name2arg[mapped_name].type, ListType):
+ mapped_expr = mapped_name + "[i]"
+ else:
+ mapped_expr = mapped_name
+ new_expr = ref_input.expr.replace(ref_input_jit_name, mapped_expr)
+ modified_formula = modified_formula.replace(
+ cast(str, ref_input.nctype.name), new_expr
+ )
+
+ nctype = cpp.argument_type(map_name2arg[mapped_name], binds=mapped_name)
+ canonical_nctype = NamedCType(
+ nctype.name, nctype.type.remove_const_ref()
+ )
+ saved_inputs.append(
+ SavedAttribute(nctype=canonical_nctype, expr=mapped_name)
+ )
+ for ref_output in derivative.saved_outputs:
+ if ref_output.nctype.name == "result":
+ saved_outputs.append(
+ SavedAttribute(
+ nctype=NamedCType(
+ name="result", type=BaseCType(tensorListT)
+ ),
+ expr="result",
+ )
+ )
+ else:
+ raise RuntimeError("")
+ var_names = [map_refarg2foreacharg[var] for var in derivative.var_names]
+ all_var_names.extend(var_names)
+ all_saved_inputs.extend(saved_inputs)
+ all_saved_outputs.extend(saved_outputs)
+ modified_derivative = Derivative(
+ formula=modified_formula,
+ original_formula=derivative.formula,
+ var_names=tuple(var_names),
+ saved_inputs=tuple(saved_inputs),
+ saved_outputs=tuple(saved_outputs),
+ named_gradients=set(),
+ )
+ modified_derivative_formulas.append(modified_derivative)
+
+ with local.parametrize(
+ use_const_ref_for_mutable_tensors=foreach_function.use_const_ref_for_mutable_tensors,
+ use_ilistref_for_tensor_lists=foreach_function.part_of_structured_group,
+ ):
+ args_with_derivatives = [
+ Binding(
+ name=arg.name,
+ nctype=cpp.argument_type(arg, binds=arg.name),
+ argument=arg,
+ default=None,
+ )
+ for arg in foreach_function.func.arguments.flat_non_out
+ if arg.name in all_var_names
+ ]
+
+ forward_derivatives: List[ForwardDerivative] = []
+ fw_derivative: ForwardDerivative
+ for fw_derivative in ref_diff_info.forward_derivatives:
+ var_names: List[str] = list(fw_derivative.var_names) # type: ignore[no-redef]
+ var_types: List[Type] = list(fw_derivative.var_types)
+ required_inputs_fw_grad: List[str] = []
+ required_inputs_primal: List[str] = []
+ if fw_derivative.required_inputs_fw_grad is not None:
+ required_inputs_fw_grad = list(fw_derivative.required_inputs_fw_grad)
+ if fw_derivative.required_inputs_primal:
+ required_inputs_primal = list(fw_derivative.required_inputs_primal)
+ modified_formula = fw_derivative.formula
+
+ # Foreach's result is TensorList
+ if "result" in modified_formula:
+ modified_formula = fw_derivative.formula.replace("result", "result[i]")
+
+ for foreach_arg, ref_arg in zip(
+ foreach_function.func.arguments.flat_non_out,
+ ref_diff_info.func.func.arguments.flat_non_out,
+ ):
+ # Modify reference forward formula
+ if (
+ isinstance(foreach_arg.type, ListType)
+ and not foreach_arg.type.is_tensor_like()
+ ):
+ # Assuming ScalarList
+ modified_formula = modified_formula.replace(
+ ref_arg.name, foreach_arg.name + "[i]"
+ )
+ elif foreach_arg.type.is_tensor_like():
+ # Assuming TensorList / Tensor
+ # assert isinstance(foreach_arg.type, ListType), f"{foreach_function.func.name}, {foreach_arg.type}"
+ assert isinstance(foreach_arg.type, ListType) or (
+ foreach_arg.type == BaseType(BaseTy.Tensor)
+ and str(foreach_function.func.name) in _foreach_with_tensor_overload
+ ), f"{foreach_function.func.name}, {foreach_arg.type}"
+ for suffix in ("_p", "_t"):
+ curr_expr = ref_arg.name + suffix
+ if curr_expr in modified_formula:
+ new_expr = foreach_arg.name + suffix
+ modified_formula = modified_formula.replace(curr_expr, new_expr)
+ else:
+ # Assuming Scalar
+ if foreach_arg.name != ref_arg.name:
+ modified_formula = modified_formula.replace(
+ ref_arg.name, foreach_arg.name
+ )
+
+ # note(crcrpar): there should exist a cooler way...
+ for i, name in enumerate(var_names):
+ if name == ref_arg.name:
+ var_names[i] = foreach_arg.name
+ var_types[i] = foreach_arg.type
+ for i, name in enumerate(required_inputs_fw_grad):
+ if name == ref_arg.name:
+ required_inputs_fw_grad[i] = foreach_arg.name
+ for i, name in enumerate(required_inputs_primal):
+ if name == ref_arg.name:
+ required_inputs_primal[i] = foreach_arg.name
+ forward_derivatives.append(
+ ForwardDerivative(
+ formula=modified_formula,
+ var_names=tuple(var_names),
+ var_types=tuple(var_types),
+ required_inputs_fw_grad=tuple(required_inputs_fw_grad),
+ required_inputs_primal=tuple(required_inputs_primal),
+ required_original_self_value=fw_derivative.required_original_self_value,
+ is_reusing_outplace_formula=fw_derivative.is_reusing_outplace_formula,
+ )
+ )
+
+ return (
+ DifferentiabilityInfo(
+ name=foreach_function.func.name.name.base,
+ func=foreach_function,
+ op=f"Foreach{ref_diff_info.op}{foreach_function.func.name.overload_name}",
+ derivatives=modified_derivative_formulas,
+ forward_derivatives=forward_derivatives,
+ all_saved_inputs=tuple(set(all_saved_inputs)),
+ all_saved_outputs=tuple(set(all_saved_outputs)),
+ available_named_gradients=(),
+ used_named_gradients=set(),
+ args_with_derivatives=args_with_derivatives,
+ non_differentiable_arg_names=[],
+ output_differentiability=None,
+ output_differentiability_conditions=None,
+ ),
+ True,
+ )
+
+
+def match_differentiability_info(
+ native_functions: List[NativeFunction],
+ differentiability_infos: Dict[FunctionSchema, Dict[str, DifferentiabilityInfo]],
+) -> List[NativeFunctionWithDifferentiabilityInfo]:
+ """Sets the "derivative" key on declarations to matching autograd function
+ In-place functions will use the out-of-place derivative definition if there
+ is no in-place specific derivative.
+ """
+
+ functional_info_by_signature = {
+ schema.signature(strip_default=True): info_dict
+ for schema, info_dict in differentiability_infos.items()
+ if schema.kind() == SchemaKind.functional
+ }
+ non_functional_info_by_signature = {
+ schema.signature(strip_default=True): info_dict
+ for schema, info_dict in differentiability_infos.items()
+ if schema.kind() != SchemaKind.functional
+ }
+
+ def find_info(
+ f: NativeFunction,
+ ) -> Tuple[Optional[Dict[str, DifferentiabilityInfo]], bool]:
+ # Don't bother matching info to generated out= variants
+ if "generated" in f.tags and f.func.kind() == SchemaKind.out:
+ return None, False
+
+ # (1) Check for an exact match
+ if f.func in differentiability_infos:
+ return differentiability_infos[f.func], True
+
+ # (2) If no exact match, check if the out-of-place variant
+ # of this operator has a match.
+ # i.e mul() for mul_() or mul_out()
+ # note(crcrpar): Check foreach or not because in-place foreach functions use backward defined for the existing
+ # native functions instead of the out-place counterparts.
+ f_sig = f.func.signature(strip_default=True)
+ if f_sig in functional_info_by_signature and not is_foreach_func(f):
+ return functional_info_by_signature[f_sig], False
+
+ # (3) Some operators have a derivative explicitly defined for the mutable
+ # variant, but get a code-generated out-of-place variant which does *not*
+ # come with a derivative formula.
+ # For the generated out-of-place variant, use the mutable variant's formula
+ # if it exists.
+ if "generated" in f.tags and f_sig in non_functional_info_by_signature:
+ info_dict = non_functional_info_by_signature[f_sig]
+ # See https://github.com/pytorch/pytorch/pull/76320/files#r874816389
+ assert not any(
+ any("self" in str(inpt.nctype.name) for inpt in info.all_saved_inputs)
+ for info in info_dict.values()
+ ), f"""\
+Attempted to convert a derivative formula for a mutable operator
+ to be used automatically by its functional variant ("{str(f.func)}").
+ This is not currently supported (we'd need to fix up the formula in the codegen)."""
+ return info_dict, False
+
+ # (4) Generate derivative information of foreach functions if none is defined in `derivatives.yaml`
+ if is_foreach_func(f):
+ assert f.func not in differentiability_infos
+ diff_info, is_generated = gen_foreach_derivativeinfo(
+ f,
+ functional_info_by_signature,
+ non_functional_info_by_signature,
+ )
+ if diff_info is None:
+ return None, False
+ # TODO(crcrpar): Avoid hard coding "Default" ideally.
+ diff_info_dict = {"Default": diff_info}
+ if is_generated:
+ differentiability_infos[f.func] = diff_info_dict
+ functional_info_by_signature[f.func] = diff_info_dict
+ return diff_info_dict, is_generated
+
+ return None, False
+
+ result: List[NativeFunctionWithDifferentiabilityInfo] = []
+ for f in native_functions:
+ info_dict, is_exact_match = find_info(f)
+
+ # Currently, the '.strides()' to 'strides_or_error' replacement does not support
+ # 'self' derivatives of an inplace function, so we must check for this case.
+ if f.func.kind() == SchemaKind.inplace and (info_dict is not None):
+ for info in info_dict.values():
+ for derivative in info.derivatives:
+ if "self" in derivative.var_names:
+ for saved_input in derivative.saved_inputs:
+ assert "strides_or_error" not in saved_input.expr, (
+ "Calling '.strides()' in the 'self' derivative formula of an "
+ f"in-place function is not supported: {f.func}"
+ )
+
+ if not info_dict:
+ result.append(
+ NativeFunctionWithDifferentiabilityInfo(
+ func=f, info=None, fw_derivatives=None
+ )
+ )
+ continue
+
+ fw_derivative_dict: Dict[str, Sequence[ForwardDerivative]] = {}
+ for key, info in info_dict.items():
+ if not info.forward_derivatives:
+ fw_derivative_dict[key] = []
+ continue
+
+ forward_derivatives = info.forward_derivatives
+
+ # For functions that have a single def for out-of-place and inplace (like abs())
+ if f.func.kind() == SchemaKind.inplace:
+ # For inplace functions there is a little bit of work to do:
+ # 1) Validate the formula and make sure the input that is modified is not used:
+ # - If there is a formula for the inplace variant of the function (is_exact_match == True) then
+ # we make sure that the original value of the input that is being modified inplace (self_p) is
+ # not used in the formula. Note that the formula can use "original_self_p" here and that would
+ # trigger a clone of the original input.
+ # - If we are re-using the out of place formula (is_exact_match == False) then we replace every
+ # occurrence of self_p and self_t by original_self_p and original_self_t. These will be
+ # populated by cloned version of the original input (either the clone done by the backward AD
+ # logic if self is also used in a backward formula or a special clone that we add).
+ # 2) At this point, there cannot be a self_p in the formula.
+ # 3) Change "result" into "self_p" as by design, in the inplace function codegen, the result is
+ # simply called self (as it is modified inplace).
+ # 4) Update the required primals data in case it used to contain "result" but should now contain
+ # "self"
+ # 5) If it is not an exact match, the user formula is not modifying the existing forward grad
+ # inplace as it should. So add some code that makes sure that we do so if the forward grad
+ # already exists.
+
+ assert (
+ len(info.forward_derivatives) == 1
+ ) # Only single output inplace should exist
+ fw_info = info.forward_derivatives[0]
+ formula = fw_info.formula
+
+ def replace_self_with_original_self(formula: str, postfix: str) -> str:
+ def repl(m: Match[str]) -> str:
+ return f"{m.group(1)}original_self{postfix}{m.group(2)}"
+
+ return re.sub(IDENT_REGEX.format(f"self{postfix}"), repl, formula)
+
+ if re.search(IDENT_REGEX.format("self_p"), formula):
+ if is_exact_match:
+ # For manually defined formulas, don't allow the original value to be used
+ raise RuntimeError(
+ f'The formula for "{f.func.name}" is using the original value of self '
+ "that is being modified inplace. This would lead to wrong forward gradients. "
+ 'Please use "result" in the formula only.'
+ )
+ else:
+ # When the original formula is out of place, we save a clone of the primal
+ # value to be able to access this value if needed
+ # replace "self_p"/"self_t" from the formula by "original_self_p"/"original_self_t"
+ formula = replace_self_with_original_self(formula, "_p")
+ formula = replace_self_with_original_self(formula, "_t")
+
+ # replace "result" from the formula by "self_p"
+ def repl(m: Match[str]) -> str:
+ return f"{m.group(1)}self_p{m.group(2)}"
+
+ formula = re.sub(IDENT_REGEX.format("result"), repl, formula)
+
+ required_primals = fw_info.required_inputs_primal
+ if re.search(IDENT_REGEX.format("self_p"), formula):
+ required_primals = (
+ required_primals + ("self",) if required_primals else ("self",)
+ )
+
+ if not is_exact_match:
+ # NOTE [In-place forward AD formula Optimization]
+ #
+ # This optimization transforms the formula to directly do inplace, i.e.
+ # instead of self_t.copy_(self_t.op()) we do self_t.op_() when the following are met:
+ #
+ # 1) the formula satisfies the pattern: "self_t.op(*args)"
+ # 2) "op" in (1) needs to be the same as the op the derivative is for
+ #
+ # (2) may seem too strict, but currently the only ops that satisfy (1) also satisfy (2)
+ # If there is a need, we can relax (2) to allow any op that has an in-place variant
+ is_single_method_on_self_t = False
+ directly_do_inplace = False
+ op_name: Optional[str] = None
+ between_parens: Optional[str] = None
+ match = re.fullmatch(r"self_t.([\w]*)\((.*)\)", formula)
+ if match:
+ op_name, between_parens = match.group(1), match.group(2)
+
+ # We want to...
+ # Match: self_t.op1(other_p.op2(arg))
+ # Avoid: self_t.op1(args) + self_t.op2(args)
+ # Avoid: self_t.op1(other_p.op2(arg)) + self_t.op2(args)
+ def check_parens_nest_level_gt_zero(s: str) -> bool:
+ level = 1
+ for ch in s:
+ if ch == ")":
+ level -= 1
+ if level == 0:
+ return False
+ if ch == "(":
+ level += 1
+ return True
+
+ is_single_method_on_self_t = check_parens_nest_level_gt_zero(
+ between_parens
+ )
+ directly_do_inplace = (
+ is_single_method_on_self_t and op_name == info.name
+ )
+
+ if directly_do_inplace:
+ assert op_name is not None
+ assert between_parens is not None
+ formula = f"self_t_raw.defined() ? self_t_raw.{op_name}_({between_parens}) : {formula}"
+ else:
+ # Make sure that the forward grad is modified inplace when the original formula
+ # is out of place
+ formula = f"self_t_raw.defined() ? self_t_raw.copy_({formula}) : {formula}"
+
+ required_original_self_value = bool(
+ re.search(IDENT_REGEX.format("original_self_p"), formula)
+ ) or bool(re.search(IDENT_REGEX.format("original_self_t"), formula))
+
+ forward_derivatives = [
+ ForwardDerivative(
+ formula=formula,
+ var_names=("self",),
+ var_types=fw_info.var_types,
+ required_inputs_fw_grad=fw_info.required_inputs_fw_grad,
+ required_inputs_primal=required_primals,
+ required_original_self_value=required_original_self_value,
+ is_reusing_outplace_formula=not is_exact_match,
+ ),
+ ]
+
+ fw_derivative_dict[key] = forward_derivatives
+
+ result.append(
+ NativeFunctionWithDifferentiabilityInfo(
+ func=f, info=info_dict, fw_derivatives=fw_derivative_dict
+ )
+ )
+
+ return result
+
+
+def is_differentiable(
+ name: str, type: Type, info: Optional[DifferentiabilityInfo]
+) -> bool:
+ return type.is_tensor_like() and (
+ info is None or name not in info.non_differentiable_arg_names
+ )
+
+
+def gen_differentiable_outputs(
+ fn: NativeFunctionWithDifferentiabilityInfo, key: str = "Default"
+) -> List[DifferentiableOutput]:
+ f = fn.func
+ info = fn.info[key] if fn.info else None
+ outputs: List[DifferentiableOutput] = [
+ DifferentiableOutput(
+ name=name,
+ type=ret.type,
+ cpp_type=cpp.return_type(ret, symint=True).cpp_type(),
+ )
+ for name, ret in zip(cpp.return_names(f), f.func.returns)
+ ]
+ output_differentiability = info.output_differentiability if info else None
+ if output_differentiability is not None:
+ if len(output_differentiability) != len(outputs):
+ raise RuntimeError(
+ f"The length of output_differentiability ({len(output_differentiability)}), "
+ f"does not match the number of outputs ({len(outputs)})."
+ )
+ differentiable_outputs: List[DifferentiableOutput] = []
+ if False in output_differentiability and f.func.kind() == SchemaKind.inplace:
+ raise RuntimeError(
+ "output_differentiability=False for inplace operation (version_counter won't get updated)"
+ )
+ for differentiable, output in zip(output_differentiability, outputs):
+ if differentiable:
+ differentiable_outputs.append(output)
+ return differentiable_outputs
+ candidate_differentiable_outputs = list(
+ filter(lambda r: is_differentiable(r.name, r.type, info), outputs)
+ )
+ if uses_single_grad(info):
+ return candidate_differentiable_outputs[:1]
+ else:
+ return candidate_differentiable_outputs
diff --git a/MLPY/Lib/site-packages/torchgen/api/cpp.py b/MLPY/Lib/site-packages/torchgen/api/cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccb3f28170295b883e2b36ddadf5e638d3a6ba8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/cpp.py
@@ -0,0 +1,467 @@
+from typing import List, Optional, Sequence, Set, Union
+
+from torchgen import local
+from torchgen.api.types import (
+ ArgName,
+ ArrayCType,
+ ArrayRefCType,
+ BaseCType,
+ BaseTypeToCppMapping,
+ Binding,
+ boolT,
+ ConstRefCType,
+ CType,
+ dimnameListT,
+ intArrayRefT,
+ iTensorListRefT,
+ ListCType,
+ longT,
+ MutRefCType,
+ NamedCType,
+ OptionalCType,
+ optionalIntArrayRefT,
+ optionalSymIntArrayRefT,
+ scalarT,
+ SpecialArgName,
+ symIntArrayRefT,
+ SymIntT,
+ tensorListT,
+ tensorOptionsT,
+ tensorT,
+ TupleCType,
+ VectorCType,
+ voidT,
+)
+from torchgen.model import (
+ Argument,
+ Arguments,
+ BaseTy,
+ BaseType,
+ FunctionSchema,
+ ListType,
+ NativeFunction,
+ OptionalType,
+ Return,
+ SelfArgument,
+ TensorOptionsArguments,
+ Type,
+)
+from torchgen.utils import assert_never
+
+# This file describes the translation of JIT schema to the public C++
+# API, which is what people use when they call functions like at::add.
+#
+# Prominent characteristics of the C++ API:
+#
+# - dtype, layout, device and pin_memory are collected into
+# a single C++ type TensorOptions (the native functions API
+# also has this, but tensor options is really most relevant
+# for the C++ API; it makes calling kwarg factory functions
+# pleasant)
+#
+# - defaulting lives here (in fact, the dispatcher is completely
+# oblivious of defaults!)
+#
+# BTW: policy on name collisions: we try not to have types with
+# collisions, but functions are fair game to collide
+
+
+def name(
+ func: FunctionSchema,
+ *,
+ faithful_name_for_out_overloads: bool = False,
+ symint_overload: bool = False,
+) -> str:
+ name = str(func.name.name)
+ if symint_overload:
+ name += "_symint"
+ if func.is_out_fn():
+ if faithful_name_for_out_overloads:
+ name += "_outf"
+ else:
+ name += "_out"
+
+ return name
+
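+# Illustrative sketch (not part of the original module): how `name` renders an
+# out-variant schema. The schema string below is a hypothetical example, and
+# FunctionSchema.parse is assumed to accept native_functions.yaml-style schema strings.
+#
+#   >>> from torchgen.model import FunctionSchema
+#   >>> s = FunctionSchema.parse(
+#   ...     "add.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
+#   >>> name(s)                                         # 'add_out'
+#   >>> name(s, faithful_name_for_out_overloads=True)   # 'add_outf'
+#   >>> name(s, symint_overload=True)                   # 'add_symint_out'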
+
+# Translation of "value types" in JIT schema to C++ API type. Value
+# types look the same no matter if they are argument types or return
+# types. Returns None if the type in question is not a value type.
+def valuetype_type(
+ t: Type,
+ *,
+ binds: ArgName,
+ remove_non_owning_ref_types: bool = False,
+ symint: bool = False,
+) -> Optional[NamedCType]:
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar:
+ return None
+ elif str(t) == "SymInt":
+ if symint:
+ return NamedCType(binds, BaseCType(SymIntT))
+ else:
+ return NamedCType(binds, BaseCType(longT))
+ if remove_non_owning_ref_types:
+ if t.name == BaseTy.str:
+ raise AssertionError(
+ "string ref->value conversion: not implemented yet"
+ )
+ # All other BaseType currently map directly to BaseCppTypes.
+ return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name]))
+ elif isinstance(t, OptionalType):
+ elem = valuetype_type(t.elem, binds=binds, symint=symint)
+ if elem is None:
+ return None
+ return NamedCType(binds, OptionalCType(elem.type))
+ elif isinstance(t, ListType):
+ if str(t.elem) == "bool":
+ assert t.size is not None
+ return NamedCType(binds, ArrayCType(BaseCType(boolT), t.size))
+ else:
+ return None
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
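+# Illustrative sketch (not part of the original module): value-type translation.
+# Tensor and Scalar are not value types, so they return None and are handled by
+# argumenttype_type instead; BaseTypeToCppMapping is assumed to map BaseTy.int
+# to int64_t (longT).
+#
+#   >>> from torchgen.model import BaseTy, BaseType
+#   >>> valuetype_type(BaseType(BaseTy.Tensor), binds="self")   # None
+#   >>> valuetype_type(BaseType(BaseTy.int), binds="dim")       # NamedCType('dim', BaseCType(longT))
+#   >>> valuetype_type(BaseType(BaseTy.SymInt), binds="n", symint=True)
+#   # NamedCType('n', BaseCType(SymIntT))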
+
+# Translation of types occurring in JIT arguments to a C++ argument type.
+# If remove_non_owning_ref_types is set, we'll guarantee that the output CType is not a non-owning reference type.
+# For example, we'll return std::vector instead of IntArrayRef.
+# See Note [translation from C++ reference to value types]
+def argumenttype_type(
+ t: Type,
+ *,
+ mutable: bool,
+ binds: ArgName,
+ remove_non_owning_ref_types: bool = False,
+ symint: bool = False,
+) -> NamedCType:
+ # If it's a value type, do the value type translation
+ r = valuetype_type(
+ t,
+ binds=binds,
+ symint=symint,
+ remove_non_owning_ref_types=remove_non_owning_ref_types,
+ )
+ if r is not None:
+ return r
+
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor:
+ if mutable and not local.use_const_ref_for_mutable_tensors():
+ return NamedCType(binds, MutRefCType(BaseCType(tensorT)))
+ else:
+ return NamedCType(binds, ConstRefCType(BaseCType(tensorT)))
+ elif t.name == BaseTy.Scalar:
+ return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+ else:
+ raise AssertionError(f"base type should have been value type {t}")
+ elif isinstance(t, OptionalType):
+ if str(t.elem) == "Tensor":
+ if mutable and not local.use_const_ref_for_mutable_tensors():
+ return NamedCType(
+ binds, MutRefCType(BaseCType(tensorT))
+ ) # TODO: fix this discrepancy
+ else:
+ return NamedCType(
+ binds, ConstRefCType(OptionalCType(BaseCType(tensorT)))
+ )
+ elif str(t.elem) == "Scalar":
+ return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT))))
+ elif isinstance(t.elem, ListType) and str(t.elem.elem) == "int":
+ return NamedCType(binds, BaseCType(optionalIntArrayRefT))
+ elif isinstance(t.elem, ListType) and str(t.elem.elem) == "SymInt":
+ if symint:
+ return NamedCType(binds, BaseCType(optionalSymIntArrayRefT))
+ else:
+ return NamedCType(binds, BaseCType(optionalIntArrayRefT))
+ elem = argumenttype_type(t.elem, mutable=mutable, binds=binds, symint=symint)
+ return NamedCType(binds, OptionalCType(elem.type))
+ elif isinstance(t, ListType):
+ # TODO: remove these special cases, ArrayRef fallthrough works fine
+ if str(t.elem) == "int":
+ if remove_non_owning_ref_types:
+ return NamedCType(binds, VectorCType(BaseCType(longT)))
+ else:
+ return NamedCType(binds, BaseCType(intArrayRefT))
+ if str(t.elem) == "SymInt":
+ if remove_non_owning_ref_types:
+ if symint:
+ return NamedCType(binds, VectorCType(BaseCType(SymIntT)))
+ else:
+ return NamedCType(binds, VectorCType(BaseCType(longT)))
+ else:
+ if symint:
+ return NamedCType(binds, BaseCType(symIntArrayRefT))
+ else:
+ return NamedCType(binds, BaseCType(intArrayRefT))
+ if str(t.elem) == "Tensor":
+ if local.use_ilistref_for_tensor_lists():
+ return NamedCType(binds, ConstRefCType(BaseCType(iTensorListRefT)))
+ else:
+ return NamedCType(binds, BaseCType(tensorListT))
+ elif str(t.elem) == "Scalar":
+ return NamedCType(binds, ArrayRefCType(BaseCType(scalarT)))
+ elif str(t.elem) == "Dimname":
+ return NamedCType(binds, BaseCType(dimnameListT))
+ elif str(t.elem) == "Tensor?":
+ return NamedCType(
+ binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))
+ )
+ elem = argumenttype_type(t.elem, mutable=mutable, binds=binds, symint=symint)
+ return NamedCType(binds, ArrayRefCType(elem.type))
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
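+# Illustrative sketch (not part of the original module): a few representative
+# argument translations, written informally in JIT-schema notation. The C++
+# spellings on the right are approximate renderings of the returned CTypes.
+#
+#   Tensor        -> const at::Tensor &             ConstRefCType(BaseCType(tensorT))
+#   Tensor(a!)    -> at::Tensor &                   (unless use_const_ref_for_mutable_tensors)
+#   Tensor?       -> const c10::optional<Tensor> &  ConstRefCType(OptionalCType(...))
+#   int[]         -> at::IntArrayRef                (std::vector<int64_t> when
+#                                                    remove_non_owning_ref_types=True)
+#   SymInt[]      -> c10::SymIntArrayRef when symint=True, else at::IntArrayRef
+#   Scalar[]      -> at::ArrayRef<at::Scalar>       ArrayRefCType(BaseCType(scalarT))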
+
+# Translate a JIT argument into its C++ type
+def argument_type(a: Argument, *, binds: ArgName, symint: bool = False) -> NamedCType:
+ return argumenttype_type(a.type, mutable=a.is_write, symint=symint, binds=binds)
+
+
+# Translation of a (non-multi) return type from JIT to C++
+# N.B: returntype_type returns a CType, not a NamedCType.
+# This is mostly because of the mismatch between return types and return names.
+# e.g. a function with a return type of 'void' has 0 return names,
+# and a function with a return type of 'std::tuple' has >1 return name.
+def returntype_type(t: Type, *, mutable: bool, symint: bool = False) -> CType:
+ # placeholder is ignored
+ # NB: symint is ALWAYS respected for return types. So symint argument
+ # here is IGNORED
+ r = valuetype_type(t, binds="__placeholder__", symint=True)
+ if r is not None:
+ return r.type
+
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor:
+ if mutable:
+ if local.use_const_ref_for_mutable_tensors():
+ return ConstRefCType(BaseCType(tensorT))
+ else:
+ return MutRefCType(BaseCType(tensorT))
+ else:
+ # Note [Tensor Copy Returns]
+ # Currently, we use "Argument.is_write" to determine
+ # whether or not Tensor return types should be copies or references.
+ # If that ever changes, take a look at other locations of this note!
+ return BaseCType(tensorT)
+ elif t.name == BaseTy.Scalar:
+ return BaseCType(scalarT)
+ elif isinstance(t, ListType):
+ assert (
+ not mutable
+ ), "Native functions should never return a mutable tensor list. They should return void."
+ elem = returntype_type(t.elem, mutable=False)
+ assert t.size is None, f"fixed size list returns not supported: {t}"
+ return VectorCType(elem)
+ elif isinstance(t, OptionalType):
+ elem = returntype_type(t.elem, mutable=mutable)
+ if str(t.elem) == "Tensor":
+ return OptionalCType(elem)
+
+ raise AssertionError(f"unrecognized return type {t}")
+
+
+# Translation of a single return to its C++ type
+def return_type(r: Return, *, symint: bool = False) -> CType:
+ return returntype_type(r.type, mutable=r.is_write, symint=symint)
+
+
+# Translation of a full (possibly multi) return from JIT to its C++ type
+def returns_type(rs: Sequence[Return], *, symint: bool = False) -> CType:
+ if len(rs) == 0:
+ return BaseCType(voidT)
+ elif len(rs) == 1:
+ return return_type(rs[0], symint=symint)
+ else:
+ return TupleCType([return_type(r, symint=symint) for r in rs])
+
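+# Illustrative sketch (not part of the original module): how full return types
+# are assembled from the individual Return translations above (C++ spellings
+# are approximate renderings of the CTypes).
+#
+#   ()                   -> void                           BaseCType(voidT)
+#   -> Tensor            -> at::Tensor (by value)          BaseCType(tensorT)
+#   -> Tensor(a!)        -> at::Tensor & (mutable return)  MutRefCType(BaseCType(tensorT))
+#   -> (Tensor, Tensor)  -> ::std::tuple<Tensor, Tensor>   TupleCType([...])
+#   -> Tensor[]          -> ::std::vector<Tensor>          VectorCType(BaseCType(tensorT))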
+
+def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequence[str]:
+ returns: List[str] = []
+ for i, r in enumerate(f.func.returns):
+ # If we have an inplace function, the return argument is
+ # implicitly named self.
+ # TODO: Consider incorporating this into the data model
+ if f.func.name.name.inplace:
+ assert i == 0, "illegal inplace function with multiple returns"
+ name = "self"
+ # If this is an out function, the name is the name of the
+ # corresponding out argument (r.name will get recorded
+ # in field_name later.)
+ elif f.func.is_out_fn():
+ name = f.func.arguments.out[i].name
+ # If the return argument is explicitly named...
+ elif r.name:
+ name_conflict = any(
+ r.name == a.name for a in f.func.schema_order_arguments()
+ )
+ if name_conflict and not f.func.is_out_fn():
+ name = f"{r.name}_return"
+ else:
+ name = r.name
+ # If there is no explicit name, we just use the fallback name (default: "result"),
+ # unless it's a multi-return, in which case the outputs are named result0,
+ # result1, etc. (zero-indexed)
+ else:
+ name = fallback_name if len(f.func.returns) == 1 else f"{fallback_name}{i}"
+ returns.append(name)
+ return returns
+
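+# Illustrative sketch (not part of the original module): the naming rules above
+# applied to a few hypothetical schemas.
+#
+#   add_(Tensor(a!) self, Tensor other) -> Tensor(a!)   -> ["self"]
+#   add.out(..., *, Tensor(a!) out) -> Tensor(a!)       -> ["out"]
+#   max.dim(...) -> (Tensor values, Tensor indices)     -> ["values", "indices"]
+#   add.Tensor(Tensor self, Tensor other) -> Tensor     -> ["result"]
+#   unnamed multi-return                                -> ["result0", "result1", ...]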
+
+JIT_TO_CPP_DEFAULT = {
+ "False": "false",
+ "True": "true",
+ "None": "c10::nullopt", # UGH this one is type directed
+ "Mean": "at::Reduction::Mean",
+ "[]": "{}",
+ "contiguous_format": "MemoryFormat::Contiguous",
+ "long": "at::kLong",
+}
+
+
+# Convert a JIT default into C++ expression representing the default
+def default_expr(d: str, t: Type, *, symint: bool) -> str:
+ if d == "None" and str(t) == "Tensor?":
+ return "{}"
+ if isinstance(t, BaseType) and t.name is BaseTy.str:
+ # Schema allows single quotes but C++ needs double
+ if len(d) >= 2 and d[0] == "'" and d[-1] == "'":
+ s = ""
+ i = 1
+ while i + 1 < len(d):
+ if d[i] != "\\":
+ if d[i] == '"':
+ s += '\\"'
+ else:
+ s += d[i]
+ i += 1
+ else:
+ if d[i + 1] == "'":
+ s += "'"
+ else:
+ s += d[i : i + 2]
+ i += 2
+
+ return f'"{s}"'
+
+ if isinstance(t, OptionalType):
+ if d == "None":
+ return "c10::nullopt"
+
+ return default_expr(d, t.elem, symint=symint)
+
+ if isinstance(t, ListType):
+ if d.startswith("[") and d.endswith("]"):
+ return "{" + d[1:-1] + "}"
+ elif symint and d.isdigit() and str(t.elem) == "SymInt":
+ return f"c10::SymInt({d})"
+ elif t.size is None:
+ # NOTE: Sized lists can have scalar defaults
+ raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
+
+ return JIT_TO_CPP_DEFAULT.get(d, d)
+
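+# Illustrative sketch (not part of the original module): a few representative
+# default conversions (JIT default on the left, generated C++ expression on the right).
+#
+#   "None"  for Tensor?     -> "{}"
+#   "None"  for int?        -> "c10::nullopt"
+#   "True" / "False"        -> "true" / "false"
+#   "[0, 1]" for int[]      -> "{0, 1}"
+#   "'constant'" for str    -> '"constant"'
+#   "Mean"                  -> "at::Reduction::Mean"   (via JIT_TO_CPP_DEFAULT)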
+
+# Convert an argument into its C++ API form
+
+
+def argument(
+ a: Union[Argument, TensorOptionsArguments, SelfArgument],
+ *,
+ cpp_no_default_args: Set[str],
+ method: bool,
+ faithful: bool,
+ symint: bool = False,
+ has_tensor_options: bool,
+) -> List[Binding]:
+ def sub_argument(
+ a: Union[Argument, TensorOptionsArguments, SelfArgument]
+ ) -> List[Binding]:
+ return argument(
+ a,
+ cpp_no_default_args=cpp_no_default_args,
+ method=method,
+ faithful=faithful,
+ symint=symint,
+ has_tensor_options=has_tensor_options,
+ )
+
+ if isinstance(a, Argument):
+ binds: ArgName
+ if a.name == "memory_format" and has_tensor_options:
+ binds = SpecialArgName.possibly_redundant_memory_format
+ else:
+ binds = a.name
+ default: Optional[str] = None
+ if a.name not in cpp_no_default_args and a.default is not None:
+ default = default_expr(a.default, a.type, symint=symint)
+ return [
+ Binding(
+ nctype=argument_type(a, binds=binds, symint=symint),
+ name=a.name,
+ default=default,
+ argument=a,
+ )
+ ]
+ elif isinstance(a, TensorOptionsArguments):
+ if faithful:
+ return (
+ sub_argument(a.dtype)
+ + sub_argument(a.layout)
+ + sub_argument(a.device)
+ + sub_argument(a.pin_memory)
+ )
+ else:
+ default = None
+ # Enforced by NativeFunction.__post_init__
+ assert "options" not in cpp_no_default_args
+ if all(x.default == "None" for x in a.all()):
+ default = "{}"
+ elif a.dtype.default == "long":
+ default = "at::kLong" # TODO: this is wrong
+ return [
+ Binding(
+ nctype=NamedCType("options", BaseCType(tensorOptionsT)),
+ name="options",
+ default=default,
+ argument=a,
+ )
+ ]
+ elif isinstance(a, SelfArgument):
+ if method:
+ # Caller is responsible for installing implicit this in context!
+ return []
+ else:
+ return sub_argument(a.argument)
+ else:
+ assert_never(a)
+
+
+def arguments(
+ arguments: Arguments,
+ *,
+ faithful: bool,
+ symint: bool = False,
+ method: bool,
+ cpp_no_default_args: Set[str],
+) -> List[Binding]:
+ args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+ if faithful:
+ args.extend(arguments.non_out)
+ args.extend(arguments.out)
+ else:
+ args.extend(arguments.out)
+ args.extend(arguments.non_out)
+ return [
+ r.no_default() if faithful else r
+ for a in args
+ for r in argument(
+ a,
+ faithful=faithful,
+ symint=symint,
+ method=method,
+ has_tensor_options=arguments.tensor_options is not None,
+ cpp_no_default_args=cpp_no_default_args,
+ )
+ ]
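+
+
+# Illustrative sketch (not part of the original module): what `arguments` does
+# with ordering and TensorOptions for a hypothetical factory function that also
+# has an out variant.
+#
+#   - faithful=False (user-facing C++ API): out arguments come first, the four
+#     TensorOptions pieces are packed into a single `options` binding, and C++
+#     defaults are kept.
+#   - faithful=True: arguments stay in schema (non-out then out) order, dtype /
+#     layout / device / pin_memory are emitted as separate bindings, and all
+#     defaults are stripped via Binding.no_default().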
diff --git a/MLPY/Lib/site-packages/torchgen/api/dispatcher.py b/MLPY/Lib/site-packages/torchgen/api/dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..15f059732893d92d633d58c84fb5c2a5282028a8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/dispatcher.py
@@ -0,0 +1,118 @@
+import itertools
+from typing import List, Sequence, Union
+
+from torchgen.api import cpp
+
+from torchgen.api.types import ArgName, Binding, CType, NamedCType
+from torchgen.model import (
+ Argument,
+ FunctionSchema,
+ Return,
+ SelfArgument,
+ TensorOptionsArguments,
+ Type,
+)
+from torchgen.utils import assert_never, concatMap
+
+# This file describes the translation of JIT schema to the dispatcher
+# API, the *unboxed* calling convention by which invocations through
+# the dispatcher are made. Historically, the dispatcher API matched
+# the C++ API, but with the establishment of the boxed API, we've
+# made changes to the dispatcher API so that the unboxed API
+# better aligns with the boxed API. The dispatcher API hooks heavily
+# into our template based boxing/unboxing machinery, so changes
+# to this convention will usually need template updates too.
+#
+# Prominent characteristics of the dispatcher API:
+#
+# - dtype, layout, device and pin_memory are represented as separate
+# arguments.
+#
+
+
+def name(func: FunctionSchema) -> str:
+ return cpp.name(func)
+
+
+def argumenttype_type(
+ t: Type,
+ *,
+ mutable: bool,
+ binds: ArgName,
+ remove_non_owning_ref_types: bool = False,
+ symint: bool = True,
+) -> NamedCType:
+ # This is a faux ami. If it makes sense in the future to add
+ # more special cases here, or invert things so cpp.argument_type
+ # calls this, or just completely inline the function, please do
+ # it.
+ return cpp.argumenttype_type(
+ t,
+ mutable=mutable,
+ binds=binds,
+ symint=symint,
+ remove_non_owning_ref_types=remove_non_owning_ref_types,
+ )
+
+
+def argument_type(
+ a: Argument,
+ *,
+ binds: ArgName,
+ remove_non_owning_ref_types: bool = False,
+ symint: bool = True,
+) -> NamedCType:
+ return argumenttype_type(
+ a.type,
+ mutable=a.is_write,
+ binds=binds,
+ remove_non_owning_ref_types=remove_non_owning_ref_types,
+ symint=symint,
+ )
+
+
+def returns_type(rs: Sequence[Return], *, symint: bool = True) -> CType:
+ # At present, there is no difference. But there could be!
+ return cpp.returns_type(rs, symint=symint)
+
+
+def jit_arguments(func: FunctionSchema) -> List[Argument]:
+ def to_argument(
+ a: Union[Argument, TensorOptionsArguments, SelfArgument]
+ ) -> List[Argument]:
+ if isinstance(a, Argument):
+ return [a]
+ elif isinstance(a, SelfArgument):
+ return [a.argument]
+ elif isinstance(a, TensorOptionsArguments):
+ return [a.dtype, a.layout, a.device, a.pin_memory]
+ else:
+ assert_never(a)
+
+ return list(
+ concatMap(
+ to_argument,
+ itertools.chain(
+ func.arguments.positional, func.arguments.kwarg_only, func.arguments.out
+ ),
+ )
+ )
+
+
+def argument(
+ a: Argument, *, remove_non_owning_ref_types: bool = False, symint: bool = True
+) -> Binding:
+ return Binding(
+ nctype=argument_type(
+ a,
+ binds=a.name,
+ remove_non_owning_ref_types=remove_non_owning_ref_types,
+ symint=symint,
+ ),
+ name=a.name,
+ argument=a,
+ )
+
+
+def arguments(func: FunctionSchema, *, symint: bool = True) -> List[Binding]:
+ return [argument(a, symint=symint) for a in jit_arguments(func)]
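+
+
+# Illustrative sketch (not part of the original module): the dispatcher API keeps
+# one binding per JIT argument, scattering TensorOptions. For a hypothetical
+# factory schema
+#   ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None,
+#        Device? device=None, bool? pin_memory=None) -> Tensor
+# `jit_arguments` yields [size, dtype, layout, device, pin_memory], and
+# `arguments` produces one Binding for each of them (with symint=True by
+# default, so `size` binds to c10::SymIntArrayRef).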
diff --git a/MLPY/Lib/site-packages/torchgen/api/functionalization.py b/MLPY/Lib/site-packages/torchgen/api/functionalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a31e99ea2e6596e01721e2fa0b63866648ac310
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/functionalization.py
@@ -0,0 +1,199 @@
+from typing import List, Optional
+
+from torchgen.api import dispatcher
+from torchgen.api.types import (
+ BaseCppType,
+ BaseCType,
+ Binding,
+ boolT,
+ ConstRefCType,
+ CType,
+ longT,
+ NamedCType,
+ tensorT,
+)
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ FunctionSchema,
+ NativeFunction,
+ NativeFunctionsViewGroup,
+)
+
+
+# This file describes the translation of JIT schema to APIs used
+# when creating view lambdas that are used by the functionalization pass.
+# There are two types of lambdas: forward lambdas and reverse lambdas.
+# These APIs mostly follow the dispatcher API, with a few quirks:
+# - The lambda capture has to convert reference types to value types
+# - While the forward lambda just directly calls into the at::_ops API
+# (following the dispatcher convention), the logic here for the reverse lambda
+# is responsible for generating both the call-site, and the declarations
+# (which are implemented manually in the at::functionalization::impl namespace).
+
+# The lambdas generated for each view op in the functionalization pass are of the form
+# [capture_arguments](outer_arguments) -> returns_type {
+# return name(inner_arguments);
+# }
+
+# Define some specific lambda input arguments.
+base_binding = Binding(
+ name="base",
+ nctype=NamedCType(name="base", type=ConstRefCType(BaseCType(tensorT))),
+ argument=Argument(
+ name="base", type=BaseType(BaseTy.Tensor), default=None, annotation=None
+ ),
+ default=None,
+)
+mutated_view_binding = Binding(
+ name="mutated_view",
+ nctype=NamedCType(name="mutated_view", type=ConstRefCType(BaseCType(tensorT))),
+ argument=Argument(
+ name="base", type=BaseType(BaseTy.Tensor), default=None, annotation=None
+ ),
+ default=None,
+)
+mutated_view_idx_binding = Binding(
+ name="mutated_view_idx",
+ nctype=NamedCType(name="mutated_view_idx", type=BaseCType(longT)),
+ argument=Argument(
+ name="base", type=BaseType(BaseTy.Tensor), default=None, annotation=None
+ ),
+ default=None,
+)
+reapply_views_binding = Binding(
+ name="reapply_views",
+ nctype=NamedCType(name="reapply_views", type=BaseCType(boolT)),
+ argument=Argument(
+ name="reapply_views", type=BaseType(BaseTy.bool), default=None, annotation=None
+ ),
+ default=None,
+)
+
+InverseReturnModeT = BaseCppType("at::functionalization", "InverseReturnMode")
+inverse_return_mode_binding = Binding(
+ name="inverse_return_mode",
+ nctype=NamedCType(name="inverse_return_mode", type=BaseCType(InverseReturnModeT)),
+ argument=Argument(
+ name="inverse_return_mode",
+ # NB: not actually a bool but it doesn't matter because this isn't used
+ type=BaseType(BaseTy.bool),
+ default=None,
+ annotation=None,
+ ),
+ default=None,
+)
+
+
+# The lambda capture itself doesn't have a name.
+# The name returned here corresponds to the name of the inner function called by the lambda.
+def name(
+ g: NativeFunctionsViewGroup,
+ *,
+ is_reverse: bool,
+ include_namespace: bool,
+ reapply_views: Optional[bool] = None,
+) -> str:
+ if reapply_views is None:
+ # reapply_views is only important for the fwd lambda,
+ # since we always plumb the runtime "reapply_views" argument into the reverse function.
+ assert is_reverse
+ if is_reverse:
+ return reverse_name(g.view, include_namespace)
+ # in the forward case, we just directly call into the at::_ops API (so we always need the namespace)
+ assert include_namespace
+ assert g.view_copy is not None
+ api_name = (
+ g.view.func.name.unambiguous_name()
+ if reapply_views
+ else g.view_copy.func.name.unambiguous_name()
+ )
+ return f"at::_ops::{api_name}::call"
+
+
+def reverse_name(f: NativeFunction, include_namespace: bool) -> str:
+ # for the reverse: we plumb the "reapply_views" flag into that function and support
+ # both copy and non-copy variants. (We could avoid doing that, but that would require
+ # writing out twice as many view inverse functions).
+ api_name = f.func.name.unambiguous_name()
+ # in the reverse case, we codegen both the call-sites (which need the full namespace) and the declarations (which don't)
+ if include_namespace:
+ return f"at::functionalization::FunctionalInverses::{api_name}_inverse"
+ else:
+ return f"{api_name}_inverse"
+
+
+def capture_arguments(func: FunctionSchema, *, is_reverse: bool) -> List[Binding]:
+ # capture arguments include all arguments except `self`.
+ # Importantly, they don't include any C++ reference types (or else we'd get a dangling reference in the capture),
+ # so any reference types (IntArrayRef) need to be converted to value types (vector).
+ args = func.arguments.flat_all
+ assert args[0].type == BaseType(BaseTy.Tensor)
+ non_self_args = args[1:]
+ non_self_value_bindings = [
+ dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args
+ ]
+
+ all_bindings = [
+ inverse_return_mode_binding if is_reverse else reapply_views_binding
+ ]
+ all_bindings.extend(non_self_value_bindings)
+ return all_bindings
+
+
+def returns_type(func: FunctionSchema) -> CType:
+ # Assertion: all view ops return tensor-like outputs
+ assert len(func.returns) >= 1
+ for ret in func.returns:
+ assert ret.type.is_tensor_like()
+ # However, the return type of the lambda is always an individual tensor.
+ # For multi-tensor outputs, each tensor needs to be tracked individually.
+ return BaseCType(tensorT)
+
+
+def outer_arguments(*, is_reverse: bool) -> List[Binding]:
+ if is_reverse:
+ return [base_binding, mutated_view_binding, mutated_view_idx_binding]
+ else:
+ return [base_binding, mutated_view_idx_binding]
+
+
+def inner_call_index(func: FunctionSchema) -> Optional[Binding]:
+ # For view ops that return multiple tensors (like `split`), we generate a separate lambda for each output.
+ # When we replay a view op that returns multiple tensors, we need to index into the output appropriately
+ if len(func.returns) > 1 or (
+ len(func.returns) == 1 and func.returns[0].type.is_list_like()
+ ):
+ return mutated_view_idx_binding
+ return None
+
+
+def inner_arguments(func: FunctionSchema, is_reverse: bool) -> List[Binding]:
+ args = func.arguments.flat_all
+ assert args[0].type == BaseType(BaseTy.Tensor)
+ non_self_args = args[1:]
+ # The forward lambda calls the at::_ops API, while the reverse lambda calls the view inverse API.
+ # Both of these follow the dispatcher API.
+ non_self_bindings = [dispatcher.argument(a) for a in non_self_args]
+ if not is_reverse:
+ # the forward lambda swaps out the original tensor argument with the lambda arg "base"
+ return [base_binding] + non_self_bindings
+ else:
+ # the reverse lambda does the same, but with an additional "mutated_view" arg
+ # additionally, we have a calling convention: for view ops that return multiple tensor outputs
+ # their corresponding view_inverse function takes in an additional index argument.
+ index_binding = inner_call_index(func)
+ if index_binding is not None:
+ return [
+ base_binding,
+ mutated_view_binding,
+ inverse_return_mode_binding,
+ index_binding,
+ ] + non_self_bindings
+ else:
+ return [
+ base_binding,
+ mutated_view_binding,
+ inverse_return_mode_binding,
+ ] + non_self_bindings
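+
+
+# Illustrative sketch (not part of the original module): argument layout for a
+# hypothetical multi-output view op such as
+#   split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
+#
+#   outer_arguments(is_reverse=False)  -> [base, mutated_view_idx]
+#   outer_arguments(is_reverse=True)   -> [base, mutated_view, mutated_view_idx]
+#   capture_arguments(func, is_reverse=False)
+#       -> [reapply_views, split_size, dim]   (self is dropped, refs become owning)
+#   inner_arguments(func, is_reverse=True)
+#       -> [base, mutated_view, inverse_return_mode, mutated_view_idx, split_size, dim]
+#          (the index is present because the op returns a list of tensors)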
diff --git a/MLPY/Lib/site-packages/torchgen/api/lazy.py b/MLPY/Lib/site-packages/torchgen/api/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a5ab81faade0655cd3edeae912822910709ef4f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/lazy.py
@@ -0,0 +1,464 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from torchgen.api.types import (
+ BaseCppType,
+ BaseCType,
+ boolT,
+ CType,
+ deviceT,
+ doubleT,
+ generatorT,
+ layoutT,
+ ListCType,
+ longT,
+ memoryFormatT,
+ NamedCType,
+ OptionalCType,
+ scalarT,
+ scalarTypeT,
+ stringT,
+ SymIntT,
+ VectorCType,
+)
+
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ FunctionSchema,
+ ListType,
+ OperatorName,
+ OptionalType,
+ Return,
+ TensorOptionsArguments,
+ Type,
+)
+
+
+_valueT: Optional[BaseCppType] = None
+
+
+# A ValueT is an IR type which represents the computation of a Tensor. In other
+# words, a PyTorch user will do operations on lazy tensors, and each output lazy
+# tensor internally tracks a ValueT representing the IR node that would have
+# actually produced the value of this tensor for real.
+#
+# This is configurable because different lazy tensor backends (LTC vs XLA) will
+# have different IR representations. (Though, arguably, after unification they
+# shouldn't!)
+def getValueT() -> BaseCppType:
+ global _valueT
+ if not _valueT:
+ raise NotImplementedError(
+ "The value type needs to be set with setValueT() in run_gen_lazy_tensor()"
+ )
+
+ return _valueT
+
+
+def setValueT(val: BaseCppType) -> None:
+ global _valueT
+ _valueT = val
+
+
+# this is a bad hack. I need to refactor the data model to represent each arg in the schema as an object,
+# making it easier to represent special properties of an arg.
+tensorListValueT = BaseCppType("torch::lazy", "Value")
+
+
+def process_ir_type(
+ typ: Type, properties: "LazyIrProperties", *, symint: bool
+) -> Union[BaseCType, VectorCType, OptionalCType, ListCType]:
+ """
+ This function takes a type from NativeFunctions and converts it for use with
+ lazy tensor codegen.
+
+ Type conversion for lazy currently consists of
+ (1) changing at::Tensors into lazy::Values
+ (2) wrapping everything in a BaseCType
+ (3) making cpp-reference types into cpp-value types (e.g. vector instead of IntArrayRef)
+
+ (1) converts at::Tensors to lazy::Values (which wrap lazy::Nodes, with which Lazy IR represents tensors).
+ There is special handling for Optional[Tensor] and List[Tensor], etc. - hence 'tensor-like'.
+
+ This is incomplete: there are assertions in places because it is expected that more
+ types will need to be added as the codegen is used with more operators.
+ """
+ if isinstance(typ, BaseType):
+ if typ.name == BaseTy.Tensor:
+ return BaseCType(getValueT())
+ elif typ.name == BaseTy.Scalar:
+ if properties.TreatScalarsAsConstants:
+ return BaseCType(scalarT)
+ # at::scalar has special handling,
+ # and is wrapped in a lazy::Value just like at::Tensor
+ return BaseCType(getValueT())
+ elif typ.name == BaseTy.ScalarType:
+ return BaseCType(scalarTypeT)
+ elif typ.name == BaseTy.int:
+ return BaseCType(longT)
+ elif typ.name == BaseTy.SymInt:
+ if symint:
+ return BaseCType(getValueT())
+ else:
+ return BaseCType(longT)
+ elif typ.name == BaseTy.bool:
+ return BaseCType(boolT)
+ elif typ.name == BaseTy.float:
+ return BaseCType(doubleT)
+ elif typ.name == BaseTy.str:
+ return BaseCType(stringT)
+ elif typ.name == BaseTy.Device:
+ return BaseCType(deviceT)
+ elif typ.name == BaseTy.Generator:
+ return BaseCType(generatorT)
+ elif typ.name == BaseTy.Layout:
+ return BaseCType(layoutT)
+ elif typ.name == BaseTy.MemoryFormat:
+ return BaseCType(memoryFormatT)
+ else:
+ raise AssertionError(f"TODO add support for type {repr(typ)}")
+ elif isinstance(typ, OptionalType):
+ return OptionalCType(process_ir_type(typ.elem, properties, symint=symint))
+ elif isinstance(typ, ListType):
+ if str(typ.elem) == "Tensor?":
+ # TODO(whc) is this actually correct? or should it use a Vector like above
+ return ListCType(OptionalCType(BaseCType(getValueT())))
+ elif str(typ.elem) == "Tensor":
+ # this is a TensorList which comes in from GetTensorList as a Value
+ return BaseCType(tensorListValueT)
+ elif typ.elem == BaseType(BaseTy.SymInt):
+ # TODO: return a value type. The problem here is analogous to
+ # the problem with tensorListValueT: if you have SymInt[] you
+ # cannot conveniently save the list of Value directly, as nodes
+ # expect to save values as a vector for ALL arguments. So you
+ # need a separate IR node that represents all of the size nodes
+ # assembled into a list. I'm not an LTC dev so I don't want to
+ # figure it out right now. Y'all figure it out...
+ return VectorCType(BaseCType(longT))
+
+ else:
+ return VectorCType(process_ir_type(typ.elem, properties, symint=symint))
+ else:
+ raise AssertionError(f"unrecognized type {repr(typ)}")
+
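+# Illustrative sketch (not part of the original module): representative lazy IR
+# type translations, assuming setValueT() has already been called by the backend
+# (so getValueT() returns e.g. torch::lazy::Value).
+#
+#   Tensor    -> BaseCType(getValueT())
+#   Tensor?   -> OptionalCType(BaseCType(getValueT()))
+#   Tensor[]  -> BaseCType(tensorListValueT)
+#   Scalar    -> BaseCType(getValueT())   (BaseCType(scalarT) if TreatScalarsAsConstants)
+#   int       -> BaseCType(longT)
+#   SymInt    -> BaseCType(getValueT()) if symint else BaseCType(longT)
+#   SymInt[]  -> VectorCType(BaseCType(longT))   (not yet a value type, see TODO above)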
+
+# TODO: Determining this based off of CType is bad; this should be computed
+# from Type directly; then the same logic as process_ir_type can be used
+#
+# Invariant: passed typ should be an *owning* CType (e.g., we will report
+# that ArrayRef is NOT a value type)
+def isValueType(typ: CType, properties: "Optional[LazyIrProperties]" = None) -> bool:
+ """
+ Given a type, determine if it is a Value-like type. This is equivalent to
+ being Tensor-like, but assumes the type has already been transformed.
+ """
+ if isinstance(typ, BaseCType):
+ # I am regretting my naming conventions, but now we are wrapping at::scalar in
+ # lazy value, while preserving other 'scalar' types as scalars in the IR
+ treat_scalars_as_constants = properties and properties.TreatScalarsAsConstants
+ return (
+ typ.type == getValueT()
+ or (typ.type == scalarT and not treat_scalars_as_constants)
+ or typ.type == SymIntT
+ )
+ elif typ == VectorCType(BaseCType(SymIntT)):
+ # TODO: report True for this
+ return False
+ elif isinstance(typ, (OptionalCType, ListCType, VectorCType)):
+ return isValueType(typ.elem, properties)
+ return False
+
+
+def isSymIntType(typ: Type) -> bool:
+ return isinstance(typ, BaseType) and typ.name == BaseTy.SymInt
+
+
+def isWrappedScalarType(typ: Type) -> bool:
+ """
+ Given a type, determine if it is a c10::Scalar which we will wrap in a lazy Value.
+ Since we literally change the type from scalarT to valueT, information is lost.
+ This function helps build a list of wrapped scalars to save that information.
+ """
+ if isinstance(typ, BaseType):
+ # I am regretting my naming conventions, but now we are wrapping at::scalar in
+ # lazy value, while preserving other 'scalar' types as scalars in the IR
+ return typ.name == BaseTy.Scalar
+ elif isinstance(typ, (OptionalType, ListType)):
+ return isWrappedScalarType(typ.elem)
+ return False
+
+
+# TODO: dedupe with Type.is_generator_like
+def isGeneratorType(typ: Type) -> bool:
+ if isinstance(typ, BaseType):
+ return typ.name == BaseTy.Generator
+ elif isinstance(typ, (OptionalType)):
+ return isGeneratorType(typ.elem)
+ return False
+
+
+# This class caches a few derived properties computed from an Argument
+# and LazyIrProperties
+class LazyArgument:
+ name: str
+ orig_type: Type
+ lazy_type_: Optional[CType]
+ is_wrapped_scalar: bool
+ is_generator: bool
+ # TODO: this is a lie; it is false for symint lists
+ is_symint_or_list: bool
+
+ # Whether or not we are treating this as symint or not
+ symint: bool
+
+ # true if this argument is or contains a lazy IR value
+ is_lazy_value: bool
+
+ def __init__(self, arg: Argument, properties: "LazyIrProperties", *, symint: bool):
+ self.name = arg.name
+ self.orig_type = arg.type
+ self.symint = symint
+ self.is_optional = isinstance(arg.type, OptionalType)
+ self.is_generator = isGeneratorType(arg.type)
+ self.lazy_type_ = process_ir_type(arg.type, properties, symint=symint)
+ self.is_wrapped_scalar = isWrappedScalarType(arg.type)
+ self.is_symint_or_list = symint and (
+ isSymIntType(arg.type)
+ or (isinstance(arg.type, OptionalType) and isSymIntType(arg.type.elem))
+ # TODO: lists of symints are not currently treated as value types
+ # or (isinstance(arg.type, ListType) and isSymIntType(arg.type.elem))
+ )
+
+ self.is_lazy_value = isValueType(self.lazy_type, properties)
+
+ @property
+ def lazy_type(self) -> CType:
+ assert (
+ self.lazy_type_ is not None
+ ), f"Attempted to access lazy_type for invalid argument {self.name}"
+ return self.lazy_type_
+
+
+class LazyIrProperties:
+ """Collection of properties for an IR node
+
+ The property groups are listed below. Each group is mutually
+ exclusive, meaning that only one property from each group can be True
+ at any one time. The properties can be accessed as if they were normal
+ attributes. The mutual exclusivity is automatically handled.
+ """
+
+ Properties: Tuple[Tuple[str, ...], ...] = (
+ (
+ "ShapePrecompute", # Assume shape has been precomputed
+ "ShapeCompute", # Need to compute the shape on construction
+ "ShapeCache", # Utilize the shape cache to defer computation
+ ),
+ (
+ "Lower", # Codegen full lower function
+ "LowerDeclOnly", # Codegen only lower function declaration
+ ),
+ (
+ "CanBeReused", # Codegen full reuse function
+ "CanBeReusedDeclOnly", # Codegen only reuse function declaration
+ ),
+ (
+ "CreateFn", # Codegen full create function
+ "CreateFnDeclOnly", # Codegen only create function declaration
+ ),
+ (
+ "TreatScalarsAsConstants", # Treat Scalars as constants instead of handling like values
+ ),
+ )
+
+ def __init__(self, *default_properties: str):
+ properties: Dict[Tuple[str, ...], Optional[str]] = dict.fromkeys(
+ LazyIrProperties.Properties
+ )
+ self.__dict__["properties"] = properties
+ for p in default_properties:
+ setattr(self, p, True)
+
+ def __getattr__(self, key: str) -> Any:
+ properties = self.__dict__["properties"]
+ for values in LazyIrProperties.Properties:
+ if key in values:
+ return properties[values] == key
+
+ return self.__getattribute__(key)
+
+ def __setattr__(self, key: str, value: Any) -> Any:
+ properties = self.__dict__["properties"]
+ for values in LazyIrProperties.Properties:
+ if key in values:
+ properties[values] = key if value else None
+ return value
+
+ raise KeyError(f"Invalid property: {key}")
+
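+# Illustrative sketch (not part of the original module): properties within one
+# group are mutually exclusive, so enabling one disables the others in its group.
+#
+#   >>> props = LazyIrProperties("ShapePrecompute", "Lower")
+#   >>> props.ShapePrecompute          # True
+#   >>> props.ShapeCompute = True      # switches the shape group to ShapeCompute
+#   >>> props.ShapePrecompute          # now False
+#   >>> props.ShapeCompute             # True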
+
+# Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node.
+# Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML),
+# but carries type information from a native FunctionSchema modified for use with IR nodes,
+# and preserving original argument names.
+#
+# TODO: This is not idiomatic with how other torchgen APIs transform on schema.
+class LazyIrSchema:
+ # The name of the operator this function schema describes.
+ name: "OperatorName"
+
+ positional_args: Tuple[LazyArgument, ...]
+ keyword_args: Tuple[LazyArgument, ...]
+
+ # TODO: Need to handle collisions with argument names at some point
+ returns: Tuple["Return", ...]
+
+ # if this schema has a Generator arg, list its orig ctype/name but don't
+ # build a LazyArgument since lazy IR doesn't support it
+ generator_arg: Optional[NamedCType] = None
+
+ # original function schema
+ func: FunctionSchema
+
+ # Whether or not we are code-genning for SymInt or not
+ symint: bool
+
+ properties: LazyIrProperties = LazyIrProperties(
+ # default properties
+ "ShapePrecompute",
+ "Lower",
+ "CanBeReused",
+ )
+ opkind: Optional[str] = None
+
+ def __init__(
+ self,
+ func: FunctionSchema,
+ properties: Optional[LazyIrProperties] = None,
+ *,
+ symint: bool,
+ ):
+ if properties:
+ self.properties = properties
+
+ self.func = func
+ self.symint = symint
+ positional_args: List[LazyArgument] = []
+ for arg_field in ["pre_self_positional", "self_arg", "post_self_positional"]:
+ if arg_field == "self_arg" and func.arguments.self_arg is not None:
+ arg = func.arguments.self_arg.argument
+ positional_args.append(
+ LazyArgument(arg, self.properties, symint=symint)
+ )
+ elif getattr(func.arguments, arg_field) is not None:
+ positional_args.extend(
+ LazyArgument(arg, self.properties, symint=symint)
+ for arg in getattr(func.arguments, arg_field)
+ )
+ self.positional_args = tuple(positional_args)
+
+ keyword_args: List[LazyArgument] = []
+ for arg_field in [
+ "pre_tensor_options_kwarg_only",
+ "tensor_options",
+ "post_tensor_options_kwarg_only",
+ "out",
+ ]:
+ curr_args = getattr(func.arguments, arg_field)
+ if curr_args is not None:
+ if isinstance(curr_args, TensorOptionsArguments):
+ curr_args = curr_args.all()
+ for arg in curr_args:
+ if isGeneratorType(arg.type):
+ assert (
+ self.generator_arg is None
+ ), "We expect there is only one generator arg"
+ self.generator_arg = NamedCType(
+ arg.name, arg.type # type:ignore[arg-type]
+ )
+ keyword_args.extend(
+ LazyArgument(arg, self.properties, symint=symint)
+ for arg in curr_args
+ )
+ self.keyword_args = tuple(keyword_args)
+ self.name = func.name
+ self.returns = func.returns
+
+ @property
+ def node_name(self) -> str:
+ """
+ Return camel-case version of op in node.
+
+ Note: This function also appends any `overload_name` in the operation.
+ For example, if the op is `bitwise_and.Tensor`, the returned name
+ will be `BitwiseAndTensor`.
+ """
+ op_name = f"{self.name.name}_{self.name.overload_name}".lower()
+ return "".join(word.capitalize() or "" for word in op_name.split("_"))
+
+ @property
+ def aten_name(self) -> str:
+ return str(self.name.name)
+
+ @property
+ def base_name(self) -> str:
+ return f"{self.name.name.base}"
+
+ def filtered_args(
+ self,
+ positional: bool = True,
+ keyword: bool = True,
+ values: bool = True,
+ scalars: bool = True,
+ generator: bool = True,
+ ) -> List[LazyArgument]:
+ # This function maintains the sorted order of arguments but provides different filtered views.
+ # Some parts of the code care about kwargs vs args (TS lowerings),
+ # other parts care about whether they need to wrap the arg in a lazy value or leave it alone.
+ # Generators are special cased, as they are needed for fallback/shape-inference but not supported
+ # in TS lowerings and therefore also omitted from lazy IR.
+ args: List[LazyArgument] = []
+ if positional:
+ args.extend(self.positional_args)
+ if keyword:
+ args.extend(self.keyword_args)
+
+ if values and scalars and generator:
+ return args
+ elif values and scalars:
+ return [a for a in args if not a.is_generator]
+ elif values:
+ return [a for a in args if a.is_lazy_value]
+ elif scalars:
+ return [
+ a
+ for a in args
+ if not a.is_lazy_value and (generator or not a.is_generator)
+ ]
+
+ return []
+
+ @property
+ def positional_values(self) -> List[LazyArgument]:
+ return self.filtered_args(
+ positional=True, keyword=False, values=True, scalars=False
+ )
+
+ @property
+ def positional_scalars(self) -> List[LazyArgument]:
+ return self.filtered_args(
+ positional=True, keyword=False, values=False, scalars=True
+ )
+
+ @property
+ def keyword_values(self) -> List[LazyArgument]:
+ return self.filtered_args(
+ positional=False, keyword=True, values=True, scalars=False
+ )
+
+ @property
+ def keyword_scalars(self) -> List[LazyArgument]:
+ return self.filtered_args(
+ positional=False, keyword=True, values=False, scalars=True
+ )
diff --git a/MLPY/Lib/site-packages/torchgen/api/meta.py b/MLPY/Lib/site-packages/torchgen/api/meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..40792a04e9c72397e70118730e55be5e9815723e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/meta.py
@@ -0,0 +1,12 @@
+from torchgen.model import NativeFunctionsGroup
+
+# Follows dispatcher calling convention, but:
+# - Mutable arguments not allowed. Meta functions are always
+# written in functional form. Look at FunctionSchema.signature()
+# - No tensor returns; instead we return a TensorMeta describing
+# the tensor in question
+
+
+def name(g: NativeFunctionsGroup) -> str:
+ # use the overload name from the functional version
+ return str(g.functional.func.name).replace(".", "_")
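+
+
+# Illustrative sketch (not part of the original module): the meta name is just the
+# functional overload name with '.' replaced, e.g. a hypothetical group whose
+# functional op is aten::add.Tensor would yield "add_Tensor".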
diff --git a/MLPY/Lib/site-packages/torchgen/api/native.py b/MLPY/Lib/site-packages/torchgen/api/native.py
new file mode 100644
index 0000000000000000000000000000000000000000..1138cb19329b34cf1b781d06d16e856e7e2ec1ef
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/native.py
@@ -0,0 +1,153 @@
+from typing import List, Optional, Sequence, Union
+
+from torchgen import local
+from torchgen.api import cpp
+
+from torchgen.api.types import (
+ ArgName,
+ BaseCType,
+ Binding,
+ boolT,
+ ConstRefCType,
+ CType,
+ deviceT,
+ layoutT,
+ ListCType,
+ MutRefCType,
+ NamedCType,
+ OptionalCType,
+ scalarT,
+ scalarTypeT,
+ tensorT,
+)
+from torchgen.model import (
+ Argument,
+ FunctionSchema,
+ Return,
+ SelfArgument,
+ TensorOptionsArguments,
+ Type,
+)
+from torchgen.utils import assert_never
+
+# This file describes the translation of JIT schema to the native functions API.
+# This looks a lot like the C++ API (which makes historical sense, because the
+# idea was you wrote native functions to implement functions in the C++ API),
+# but over time we have evolved the C++ API without actually changing our
+# native:: kernels. The intention is to make native API and dispatcher API
+# line up as closely as possible, since this results in the least overhead
+# (no translation is needed from dispatcher API to native API).
+#
+# NB: this is symint aware; you will get the non-SymInt variant for some
+# dispatch entries and SymInt for others.
+
+
+def name(func: FunctionSchema) -> str:
+ name = str(func.name.name)
+ # TODO: delete this!
+ if func.is_out_fn():
+ name += "_out"
+ if func.name.overload_name:
+ name += f"_{func.name.overload_name}"
+ return name
+
+
+def argumenttype_type(
+ t: Type, *, mutable: bool, binds: ArgName, symint: bool
+) -> NamedCType:
+ if str(t) == "Tensor?":
+ tensor_type: OptionalCType = OptionalCType(BaseCType(tensorT))
+ if mutable and not local.use_const_ref_for_mutable_tensors():
+ return NamedCType(binds, MutRefCType(tensor_type))
+ else:
+ return NamedCType(binds, ConstRefCType(tensor_type))
+ elif str(t) == "Tensor?[]":
+ return NamedCType(
+ binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))
+ )
+ elif str(t) == "Scalar":
+ return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+ elif str(t) == "Scalar?":
+ return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT))))
+ return cpp.argumenttype_type(t, mutable=mutable, binds=binds, symint=symint)
+
+
+def returns_type(rs: Sequence[Return], *, symint: bool) -> CType:
+ return cpp.returns_type(rs, symint=symint)
+
+
+def argument_type(a: Argument, *, binds: ArgName, symint: bool) -> NamedCType:
+ return argumenttype_type(a.type, mutable=a.is_write, binds=binds, symint=symint)
+
+
+def argument(
+ a: Union[Argument, SelfArgument, TensorOptionsArguments],
+ *,
+ is_out: bool,
+ symint: bool,
+) -> List[Binding]:
+ # Ideally, we NEVER default native functions. However, there are a number
+ # of functions that call native:: directly and rely on the defaulting
+ # existing. So for BC, we generate defaults for non-out variants (but not
+ # for out variants, where it is impossible to generate an appropriate
+ # default)
+ should_default = not is_out
+ if isinstance(a, Argument):
+ default: Optional[str] = None
+ if should_default and a.default is not None:
+ default = cpp.default_expr(a.default, a.type, symint=symint)
+ return [
+ Binding(
+ nctype=argument_type(a, binds=a.name, symint=symint),
+ name=a.name,
+ default=default,
+ argument=a,
+ )
+ ]
+ elif isinstance(a, SelfArgument):
+ # Erase SelfArgument from the distinction
+ return argument(a.argument, is_out=is_out, symint=symint)
+ elif isinstance(a, TensorOptionsArguments):
+ default = None
+ if should_default:
+ default = "{}"
+ # TODO: Not sure why the arguments assigned here are for
+ # TensorOptionsArguments and not the constituent pieces. It seems
+ # to matter
+ return [
+ Binding(
+ nctype=NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))),
+ name="dtype",
+ default=default,
+ argument=a,
+ ),
+ Binding(
+ nctype=NamedCType("layout", OptionalCType(BaseCType(layoutT))),
+ name="layout",
+ default=default,
+ argument=a,
+ ),
+ Binding(
+ nctype=NamedCType("device", OptionalCType(BaseCType(deviceT))),
+ name="device",
+ default=default,
+ argument=a,
+ ),
+ Binding(
+ nctype=NamedCType("pin_memory", OptionalCType(BaseCType(boolT))),
+ name="pin_memory",
+ default=default,
+ argument=a,
+ ),
+ ]
+ else:
+ assert_never(a)
+
+
+def arguments(func: FunctionSchema, *, symint: bool) -> List[Binding]:
+ args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+ args.extend(func.arguments.non_out)
+ args.extend(func.arguments.out)
+ return [
+ r for arg in args for r in argument(arg, symint=symint, is_out=func.is_out_fn())
+ ]
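+
+
+# Illustrative sketch (not part of the original module): for a hypothetical factory
+# function with TensorOptions, the native API scatters the options into four
+# optional bindings rather than packing them (C++ spellings are approximate):
+#
+#   dtype: c10::optional<ScalarType> = {}
+#   layout: c10::optional<Layout> = {}
+#   device: c10::optional<Device> = {}
+#   pin_memory: c10::optional<bool> = {}
+#
+# Out variants get no defaults, and out arguments are appended after the non-out ones.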
diff --git a/MLPY/Lib/site-packages/torchgen/api/python.py b/MLPY/Lib/site-packages/torchgen/api/python.py
new file mode 100644
index 0000000000000000000000000000000000000000..26a6a1f2587ecd7a6ac700940da21bdaf1c7ee70
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/python.py
@@ -0,0 +1,1509 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+
+from torchgen.api import cpp
+
+from torchgen.api.types import Binding, CppSignature, CppSignatureGroup
+from torchgen.gen import pythonify_default
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ FunctionSchema,
+ ListType,
+ NativeFunction,
+ OptionalType,
+ Return,
+ Type,
+ Variant,
+)
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# Data Models
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# [Notes] python binding codegen
+#
+# The Python binding codegen produces code that takes the input list of
+# PyObjects, finds the matching ATen C++ function using PythonArgParser,
+# converts the PyObjects into C++ types and calls the ATen C++ function:
+#
+# +--------+ parsing +------------------------+ binding +-----------------------+
+# | PyObjs | ---------> | PythonArgParser Output | ---------> | Cpp Function Dispatch |
+# +--------+ +------------------------+ +-----------------------+
+#
+# The following examples demonstrate the data models the Python binding
+# codegen needs to deal with and the tasks it needs to accomplish. It
+# helps understand the purpose of the new data types we introduced below.
+#
+# - Function Schema (source of truth)
+#
+# aten::empty.names(int[] size, *, Dimname[]? names,
+# ScalarType? dtype=None, Layout? layout=None,
+# Device? device=None, bool? pin_memory=None,
+# MemoryFormat? memory_format=None) -> Tensor
+#
+# - Python Signature
+#
+# It's used to generate input schema string for PythonArgParser.
+# Note: TensorOptions fields are reordered and the additional
+# 'requires_grad' field is added:
+#
+# empty(IntArrayRef size, *, DimnameList? names,
+# MemoryFormat? memory_format=None, ScalarType dtype=None,
+# Layout layout=torch.strided, Device device=None,
+# bool pin_memory=False, bool requires_grad=False)
+#
+# - C++ Signature
+#
+# It's used to generate C++ lambda formals & dispatch call.
+# Note: the scattered TensorOptions fields are packed into 'options'.
+#
+# auto dispatch_empty =
+# [](IntArrayRef size, c10::optional<DimnameList> names,
+# const TensorOptions & options,
+# c10::optional<MemoryFormat> memory_format) -> Tensor {
+# pybind11::gil_scoped_release no_gil;
+# return torch::empty(size, names, options, memory_format);
+# };
+#
+# - Binding between Python Arguments and C++ Arguments
+#
+# Given a set of Python Arguments in scope, we need produce the
+# binding expressions that translate the Python API into C++ API:
+#
+# Python Args Cpp Args Binding Exprs
+# -----------------------------------------------------------------
+# 0: size size '_r.intlist(0)'
+# 1: names names 'names' [special init]
+# 2: memory_format -------+
+# 3: dtype -----+-|--> options 'options' [special packing]
+# 4: layout / |
+# 5: device / +--> memory_format '_r.memoryformatOptional(2)'
+# 6: pin_memory /
+# 7: requires_grad -+
+#
+# So the full dispatch expression would look like:
+#
+# dispatch_empty(_r.intlist(0), names, options,
+# _r.memoryformatOptional(2))
+#
+# Where does 'names' come from? It involves special local init:
+#
+# auto __names = _r.toDimnameListOptional(1);
+# c10::optional<DimnameList> names =
+# __names ? c10::make_optional(DimnameList(__names.value()))
+# : c10::nullopt;
+#
+# Where does 'options' come from? It involves special local init
+# for TensorOptions. Note that Python side has the additional
+# 'requires_grad' field:
+#
+# const auto options = TensorOptions()
+# .dtype(_r.scalartype(3))
+# .device(_r.device(5))
+# .layout(_r.layoutOptional(4))
+# .requires_grad(_r.toBool(7))
+# .pinned_memory(_r.toBool(6));
+#
+# In some other cases one Python Argument can map to multiple C++
+# Arguments. For example:
+#
+# aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False)
+# -> (Tensor values, Tensor indices)
+#
+# Python Args Cpp Args Binding Exprs
+# ---------------------------------------------------------------------
+# +----> max 'out[0]'
+# /-----> max_values 'out[1]'
+# 0: input / self '_r.tensor(0)'
+# 1: dim / dim '_r.dimname(1)'
+# 2: keepdim / keepdim '_r.toBool(2)'
+# 3: out -----+ [local init] out '_r.tensorlist_n<2>(3)'
+#
+# As demonstrated above, the binding can involve reordering,
+# packing, unpacking and special local inits.
+#
+#
+# Let's look at a concrete example:
+#
+# static PythonArgParser parser({
+# "abs(Tensor input, *, Tensor out=None)",
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ^
+# +--- Python Schema, represented by PythonSignature and PythonArgument
+#
+# }, /*traceable=*/true);
+#
+# ParsedArgs<2> parsed_args;
+# auto _r = parser.parse(nullptr, args, kwargs, parsed_args);
+#
+# ...
+#
+# if (_r.isNone(1)) {
+# ~~~~~~~~~~~~ <--- Scattered PythonArgParser output (arg name = 'out')
+# represented by PythonArgParserOutputExpr
+#
+# // aten::abs(Tensor self) -> Tensor
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ^
+# +--- NativeFunction schema, base version
+#
+# auto dispatch_abs = [](const Tensor & self) -> Tensor {
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ^
+# +--- dispatch_lambda_args / dispatch_lambda_return_str
+# generated from NativeFunction / CppSignature
+# (deprecated PythonSignature is special)
+# arguments are represented by DispatchLambdaArgument
+#
+# pybind11::gil_scoped_release no_gil;
+# return self.abs();
+# ~~~~~~~~~~~ <--- cpp_dispatch_target / cpp_dispatch_exprs
+# generated from NativeFunction / CppSignature
+# };
+# return wrap(dispatch_abs(_r.tensor(0)));
+# ~~~~~~~~~~~~~
+# ^
+# +--- dispatch_lambda_exprs
+# binding PythonArgParserOutputExpr (python args)
+# and DispatchLambdaArgument (c++ args)
+#
+# } else {
+# // aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ^
+# +--- NativeFunction schema, out-variant
+#
+# auto dispatch_abs_out = [](Tensor out, const Tensor & self) -> Tensor {
+# pybind11::gil_scoped_release no_gil;
+# return at::abs_out(out, self);
+# };
+# return wrap(dispatch_abs_out(_r.tensor(1), _r.tensor(0)));
+# }
+#
+#
+# [Notes] python interface codegen
+# The python dataclasses below are used to generate both python binding code
+# and pyi type hint signatures.
+# In theory these two should look very similar, but there are a number of differences
+# in how pyi signatures vs. python_arg_parser signatures are generated.
+# These differences have been encapsulated in signature_str() vs. signature_str_pyi()
+# to display the full signatures, and argument_str() vs argument_str_pyi() to display arguments.
+# For example, only pyi signatures include return types.
+
+
+@dataclass(frozen=True)
+class PythonReturns:
+ returns: Tuple[Return, ...]
+
+
+@dataclass(frozen=True)
+class PythonArgument:
+ name: str
+ type: Type
+ default: Optional[str]
+
+ # Used to generate the default init expr for some PythonArgParser outputs, e.g.:
+ #
+ # _r.layoutWithDefault(3, layout_from_backend(self.options().backend())))
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ # ^
+ # +--- default_init str
+ default_init: Optional[str]
+
+ # Compute argument formal for python argument parsing.
+ # Needs to be consistent with torch/csrc/utils/python_arg_parser.h.
+ def argument_str(self, *, method: bool = False, symint: bool = True) -> str:
+ type_str = (
+ argument_type_str(self.type, symint=symint)
+ .replace("const ", "")
+ .replace(" &", "")
+ )
+
+ name = self.name
+ # s/self/input/ outside method bindings
+ # [old codegen] TODO: remove this? doesn't rename in codegen, it's just
+ # for the parse string
+ if name == "self" and type_str in ["Tensor", "Number"] and not method:
+ name = "input"
+
+ # add default
+ if self.default is not None:
+ default = {
+ "nullptr": "None",
+ "c10::nullopt": "None",
+ "{}": "None",
+ }.get(self.default, self.default)
+ return f"{type_str} {name}={default}"
+ else:
+ return f"{type_str} {name}"
+
+ def argument_str_pyi(
+ self, *, method: bool = False, deprecated: bool = False
+ ) -> str:
+ type_str = argument_type_str_pyi(self.type)
+
+ name = self.name
+ # s/self/input/ outside method bindings
+ # [old codegen] TODO: remove this? doesn't rename in codegen, it's just
+ # for the parse string
+ if name == "self" and type_str == "Tensor" and not method and not deprecated:
+ name = "input"
+
+ if name == "from": # from is a Python keyword...
+ name += "_"
+
+ # pyi merges the _out and functional variants into the same signature, with an optional out arg
+ if name == "out" and type_str == "Tensor" and not deprecated:
+ type_str = "Optional[" + type_str + "]"
+
+ # pyi deprecated signatures don't get defaults for their out arg
+ treat_as_no_default = (
+ deprecated
+ and isinstance(self, PythonOutArgument)
+ and self.default == "None"
+ )
+
+ # add default
+ if self.default is not None and not treat_as_no_default:
+ if (
+ isinstance(self.type, ListType)
+ and self.type.elem == BaseType(BaseTy.int)
+ and self.default.startswith("{")
+ and self.default.endswith("}")
+ ):
+ default = "(" + self.default[1:-1] + ")"
+ else:
+ default = {
+ "nullptr": "None",
+ "c10::nullopt": "None",
+ "{}": "None",
+ "MemoryFormat::Contiguous": "contiguous_format",
+ "QScheme::PER_TENSOR_AFFINE": "per_tensor_affine",
+ }.get(self.default, self.default)
+ return f"{name}: {type_str} = {default}"
+ else:
+ return f"{name}: {type_str}"
+
+
+@dataclass(frozen=True)
+class PythonOutArgument(PythonArgument):
+ # In the Python signature, multiple output fields are packed into one 'out' argument.
+ # When binding to C++, it's first bound to a local 'out' variable:
+ # 'auto out = _r.tensorlist_n<2>(2);',
+ # then bound to scattered C++ output arguments as 'out[0]', 'out[1]', etc.
+ # TODO: maybe we don't need to keep scattered out fields for the python signature?
+ outputs: Tuple[PythonArgument, ...]
+
+ @staticmethod
+ def from_outputs(
+ outputs: Tuple[PythonArgument, ...]
+ ) -> Optional["PythonOutArgument"]:
+ if not outputs:
+ return None
+
+ size = len(outputs)
+ if size == 1:
+ return PythonOutArgument(
+ name=outputs[0].name,
+ type=outputs[0].type,
+ default="None",
+ default_init=None,
+ outputs=outputs,
+ )
+ elif size > 1:
+ if any(not a.type.is_tensor_like() for a in outputs):
+ raise RuntimeError(f"Unsupported output type: {outputs}")
+ return PythonOutArgument(
+ name="out",
+ # TODO: shouldn't this be OptionalType[ListType[...]], since it defaults to None?
+ type=ListType(BaseType(BaseTy.Tensor), size),
+ default="None",
+ default_init=None,
+ outputs=outputs,
+ )
+ raise AssertionError(r"Unexpected PythonOutArgument size")
+
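+# Illustrative sketch (not part of the original module): for an op with two out
+# tensors, such as the max/max_values pair in the binding diagram near the top of
+# this file, from_outputs packs both into a single python-level argument named
+# "out" of type ListType(Tensor, 2) with default None; a single out tensor keeps
+# its own name.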
+
+@dataclass(frozen=True)
+class PythonSignature:
+ # Base operator name, without inplace/outplace suffix.
+ name: str
+
+ # Positional arguments.
+ # TODO: create a dedicated SelfArgument type for 'self'?
+ input_args: Tuple[PythonArgument, ...]
+
+ # Keyword arguments excluding the 'out' argument and scattered kwargs belonging
+ # to TensorOptions (dtype, layout, device, pin_memory, requires_grad, etc).
+ input_kwargs: Tuple[PythonArgument, ...]
+
+ output_args: Optional[PythonOutArgument]
+
+ # Return types, which are only used by pyi
+ returns: PythonReturns
+
+ # These are scattered kwargs arguments belonging to TensorOptions.
+ # When binding to C++, they are packed into a TensorOptions object 'options'.
+ # It's possible that the C++ signature doesn't take TensorOptions object (e.g.
+ # for out variant), in which case they will be used as scattered fields without
+ # being packed into 'options'.
+ # TODO: maybe create a PythonTensorOptionsArgument?
+ tensor_options_args: Tuple[PythonArgument, ...]
+
+ # method or function signature?
+ method: bool
+
+ @property
+ def deprecated(self) -> bool:
+ return False
+
+ def arguments(
+ self, *, skip_outputs: bool = False, skip_tensor_options: bool = False
+ ) -> Tuple[Union[PythonArgument, PythonOutArgument], ...]:
+ result: List[Union[PythonArgument, PythonOutArgument]] = []
+ result.extend(self.input_args)
+ result.extend(self.input_kwargs)
+ if self.output_args is not None and not skip_outputs:
+ result.append(self.output_args)
+ if not skip_tensor_options:
+ result.extend(self.tensor_options_args)
+ return tuple(result)
+
+ def arguments_count(self) -> int:
+ return len(self.arguments())
+
+ def output_idx(self) -> int:
+ return len(self.input_args) + len(self.input_kwargs)
+
+ # [old codegen] Compute the Python function signature for argument parsing,
+ # as specified in torch/csrc/utils/python_arg_parser.h. WARNING:
+ # this is NOT the same type signature as specified by PEP 484
+ # as understood by mypy; our format was independently developed
+ # and has some quirks to make it more suitable specifically
+ # for error parsing.
+ #
+ # For a translation to mypy-valid type signatures, see
+ # signature_str_pyi().
+ def signature_str(self, *, skip_outputs: bool = False, symint: bool = True) -> str:
+ args = self.arguments(skip_outputs=skip_outputs)
+ schema_formals: List[str] = [
+ a.argument_str(method=self.method, symint=symint) for a in args
+ ]
+ positional_argc = len(self.input_args)
+ if len(schema_formals) > positional_argc:
+ schema_formals.insert(positional_argc, "*")
+
+ return f'{self.name}({", ".join(schema_formals)})'
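+
+ # Example for signature_str() (editor's illustration, not emitted by this file): for a
+ # hypothetical function-variant schema foo(Tensor self, Scalar alpha=1, *, Tensor out=None),
+ # signature_str() would produce a parser string roughly like
+ # 'foo(Tensor input, Scalar alpha=1, *, Tensor out=None)'
+ # ('self' renamed to 'input', '*' inserted before the keyword-only arguments).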
+
+ def signature_str_pyi(self, *, skip_outputs: bool = False) -> str:
+ args = self.arguments(skip_outputs=skip_outputs)
+ schema_formals: List[str] = [
+ a.argument_str_pyi(method=self.method) for a in args
+ ]
+ positional_argc = len(self.input_args)
+ if len(schema_formals) > positional_argc:
+ schema_formals.insert(positional_argc, "*")
+
+ # only pyi signatures include returns
+ returns_str = returns_str_pyi(self)
+ # pyi also includes self (with no typing/defaults) for methods
+ if self.method:
+ schema_formals.insert(0, "self")
+ return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...'
+
+ def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]:
+ # only pyi uses vararg signatures
+ args = self.arguments(skip_outputs=skip_outputs)
+ schema_formals: List[str] = [
+ a.argument_str_pyi(method=self.method) for a in args
+ ]
+ # vararg only applies to pyi signatures. vararg variants are not generated for all signatures
+ num_args = self.arguments_count()
+ num_positionalargs = len(self.input_args)
+
+ have_vararg_version = False
+ if num_args > 0:
+ vararg_type = args[0].type
+ if (
+ isinstance(vararg_type, ListType)
+ and str(vararg_type.elem) in ["int", "SymInt"]
+ and num_positionalargs == 1
+ ):
+ have_vararg_version = True
+
+ if not have_vararg_version:
+ return None
+ # Below are the major changes in vararg vs. regular pyi signatures
+ # vararg signatures also omit the asterisk
+ schema_formals[0] = "*" + args[0].name + ": _int"
+
+ returns_str = returns_str_pyi(self)
+ # pyi also includes self (with no typing/defaults) for methods
+ if self.method:
+ schema_formals.insert(0, "self")
+ return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...'
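+
+ # Example (editor's illustration): for a hypothetical method whose only positional
+ # argument is an int list, e.g. view_like(int[] size), the two emitters above would
+ # produce overloads along the lines of:
+ # def view_like(self, size: _size) -> Tensor: ...
+ # def view_like(self, *size: _int) -> Tensor: ...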
+
+
+# The deprecated python signature involves some special logic, so create a
+# dedicated data model to store these extra properties.
+@dataclass(frozen=True)
+class PythonSignatureDeprecated(PythonSignature):
+ # Schema for the deprecated function
+ deprecated_schema: FunctionSchema
+
+ # The deprecated signature might be missing some arguments that the corresponding
+ # C++ signature expects. We need to store the constant default values to pass in.
+ # For example:
+ # [deprecated signature]: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2)
+ # [func schema]: aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+ # [func call]: self.addmm(mat1, mat2, beta, 1)
+ # We store ['self', 'mat1', 'mat2', 'beta', '1'] in this case.
+ deprecated_args_exprs: Tuple[str, ...]
+
+ @property
+ def deprecated(self) -> bool:
+ return True
+
+ def signature_str(self, *, skip_outputs: bool = False, symint: bool = True) -> str:
+ return (
+ PythonSignature.signature_str(
+ self, skip_outputs=skip_outputs, symint=symint
+ )
+ + "|deprecated"
+ )
+
+ def signature_str_pyi(self, *, skip_outputs: bool = False) -> str:
+ args = self.arguments(skip_outputs=skip_outputs)
+ schema_formals: List[str] = [
+ a.argument_str_pyi(method=self.method, deprecated=True) for a in args
+ ]
+ positional_argc = len(self.input_args)
+ if len(schema_formals) > positional_argc:
+ schema_formals.insert(positional_argc, "*")
+
+ returns_str = returns_str_pyi(self)
+ return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...'
+
+ def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]:
+ # the codegen doesn't include vararg variants for deprecated signatures
+ return None
+
+
+# This struct is used to hold the PythonSignature and its corresponding
+# NativeFunction BEFORE grouping base and out-variant functions.
+# Why not store NativeFunction in PythonSignature or construct PythonSignature
+# from NativeFunction? Because they are not 1-1 mapped.
+# One native function could have both deprecated and non-deprecated python
+# signatures - NativeFunction doesn't contain information to construct the
+# deprecated python signature.
+# One python signature is used to handle both the base and the out-variant
+# function - see 'PythonSignatureGroup'.
+@dataclass(frozen=True)
+class PythonSignatureNativeFunctionPair:
+ signature: PythonSignature
+ function: NativeFunction
+
+
+# We merge pairs of functions with signatures that are equivalent mod
+# output arguments, and use a single entry in the python_arg_parser sig
+# list for both (output arguments become optional).
+@dataclass(frozen=True)
+class PythonSignatureGroup:
+ # The signature used for Python argument parsing. The outplace signature
+ # is preferred if it exists, because it can be used to parse inputs for both
+ # the out-place variant and the base version (with output omitted).
+ signature: PythonSignature
+
+ # The regular ATen declaration (e.g. conv2d)
+ base: NativeFunction
+
+ # The out variant (e.g. conv2d_out)
+ outplace: Optional[NativeFunction]
+
+ @classmethod
+ def from_pairs(
+ cls,
+ functional: PythonSignatureNativeFunctionPair,
+ out: Optional[PythonSignatureNativeFunctionPair],
+ ) -> "PythonSignatureGroup":
+ if out is None:
+ return PythonSignatureGroup(
+ signature=functional.signature,
+ base=functional.function,
+ outplace=None,
+ )
+
+ # prefer the signature with optional out=... arguments because it's the
+ # superset that can be used to parse input for both base and outplace.
+ signature_kwargs = out.signature.__dict__.copy()
+
+ # Out overloads in C++ don't have TensorOptions arguments,
+ # so take these from the functional variant
+ signature_kwargs[
+ "tensor_options_args"
+ ] = functional.signature.tensor_options_args
+
+ return PythonSignatureGroup(
+ signature=type(out.signature)(**signature_kwargs),
+ base=functional.function,
+ outplace=out.function,
+ )
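+
+ # Example (editor's illustration): the base declaration aten::add.Tensor and its out
+ # variant aten::add.out would form one PythonSignatureGroup; its 'signature' comes
+ # from the out variant (so it carries the optional 'out' argument), while the
+ # tensor_options_args are copied from the functional variant as shown above.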
+
+
+# C++ function dispatch is wrapped in a lambda function. The lambda function
+# has almost the same signature as the C++ function, only with some small
+# variants - see details below.
+# This data model is used to represent arguments of the lambda function
+# signature.
+@dataclass(frozen=True)
+class DispatchLambdaArgument:
+ name: str
+ type_str: str
+ is_out_arg: bool
+
+
+ # To pass PyObject arguments to the C++ function (via the lambda wrapper),
+ # we first need to convert the PyObjects into simple C++ objects. This work
+# is done by PythonArgParser.
+# This data model is used to represent the output of PythonArgParser.
+# It has 1-1 mapping with PythonArgument in PythonSignature.
+@dataclass(frozen=True)
+class PythonArgParserOutputExpr:
+ # argument name
+ name: str
+
+ # RHS expression to reference PythonArgParser output.
+ expr: str
+
+ # In some special cases we need to create a different expr, e.g.:
+ # '_r.isNone(1)' instead of '_r.tensor(1)'.
+ index: int
+
+ # The python argument it maps to.
+ argument: PythonArgument
+
+ @property
+ def is_none_expr(self) -> str:
+ return f"_r.isNone({self.index})"
+
+
+ # To pass PythonArgParser output to the lambda wrapper, we need to bind
+ # PythonArgParserOutputExpr to DispatchLambdaArgument.
+ # They are not always 1-1 mapped, e.g. scattered TensorOptions fields
+ # need to be packed into a TensorOptions object, which is the argument
+# that the lambda function wrapper takes.
+@dataclass(frozen=True)
+class DispatchLambdaArgumentExprs:
+ # The exprs that provide the binding for lambda arguments, e.g.:
+ #
+ # 'self' -> '_r.tensor(0)'
+ # 'min' -> 'out[0]' / 'min_indices' -> 'out[1]'
+ # 'options' -> 'options'
+ #
+ # It has 1-1 mapping with DispatchLambdaArgument.
+ exprs: Sequence[str]
+
+ # Special local inits, which might introduce new variables that
+ # the 'exprs' above reference, e.g.:
+ #
+ # 'auto out = _r.tensorlist_n<2>(2);'
+ #
+ inits: Sequence[str]
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# Helper Functions
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature:
+ return CppSignatureGroup.from_native_function(f, method=method).signature
+
+
+def has_tensor_options(f: NativeFunction) -> bool:
+ return f.func.arguments.tensor_options is not None
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# Python Signature
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+# 'simple_type' was introduced by the old codegen, which is slightly
+# different from the python schema type, e.g.: doesn't have '?' suffix
+# for optional Tensor/TensorList; doesn't have '[size]' suffix for list type.
+def argument_type_str(
+ t: Type, *, simple_type: bool = False, symint: bool = True
+) -> str:
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor:
+ return "Tensor"
+ elif t.name == BaseTy.int:
+ return "int64_t"
+ elif t.name == BaseTy.float:
+ return "double"
+ elif t.name == BaseTy.str:
+ return "c10::string_view"
+ elif t.name in [
+ BaseTy.bool,
+ BaseTy.QScheme,
+ BaseTy.Scalar,
+ BaseTy.ScalarType,
+ BaseTy.Generator,
+ BaseTy.Storage,
+ BaseTy.Layout,
+ BaseTy.Device,
+ BaseTy.DeviceIndex,
+ BaseTy.MemoryFormat,
+ BaseTy.Dimname,
+ BaseTy.Stream,
+ BaseTy.ConstQuantizerPtr,
+ BaseTy.SymInt,
+ ]:
+ # These python schema type names line up with their function schema names
+ return t.name.name
+
+ elif isinstance(t, OptionalType):
+ if str(t.elem) == "Tensor":
+ # Is it desired to keep '?' for simple_type with new style dispatcher?
+ return "Tensor?"
+ elem = argument_type_str(t.elem, simple_type=simple_type, symint=symint)
+ return f"{elem}?"
+ elif isinstance(t, ListType):
+ size = t.size if not simple_type else None
+ if str(t.elem) == "bool":
+ assert t.size is not None
+ return f"::std::array"
+ elif str(t.elem) == "int":
+ return f"IntArrayRef[{size}]" if size is not None else "IntArrayRef"
+ elif str(t.elem) == "SymInt":
+ if symint:
+ return (
+ f"SymIntArrayRef[{size}]" if size is not None else "SymIntArrayRef"
+ )
+ else:
+ return f"IntArrayRef[{size}]" if size is not None else "IntArrayRef"
+ elif str(t.elem) == "Tensor":
+ return f"TensorList[{size}]" if size is not None else "TensorList"
+ elif str(t.elem) == "Scalar":
+ return f"ScalarList[{size}]" if size is not None else "ScalarList"
+ elif str(t.elem) == "Tensor?":
+ if simple_type:
+ return "c10::List>"
+ else:
+ return "const c10::List> &"
+ elif str(t.elem) == "Dimname":
+ return f"DimnameList[{size}]" if size is not None else "DimnameList"
+ elem = argument_type_str(t.elem, simple_type=simple_type, symint=symint)
+ return f"ArrayRef<{elem}>"
+
+ raise RuntimeError(f"unrecognized type {repr(t)}")
+
+
+def argument_type_size(t: Type) -> Optional[int]:
+ l = t.is_list_like()
+ if l is not None and str(l.elem) != "bool":
+ return l.size
+ else:
+ return None
+
+
+def argument(a: Argument) -> PythonArgument:
+ return PythonArgument(
+ name=a.name,
+ type=a.type,
+ # TODO: directly translate a.default to python default
+ default=str(
+ pythonify_default(cpp.default_expr(a.default, a.type, symint=False))
+ )
+ if a.default is not None
+ else None,
+ default_init=None,
+ )
+
+
+# Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen
+def signature(
+ f: NativeFunction, *, method: bool = False, pyi: bool = False
+) -> PythonSignature:
+ return signature_from_schema(
+ f.func, category_override=f.category_override, method=method, pyi=pyi
+ )
+
+
+def signature_from_schema(
+ func: FunctionSchema,
+ *,
+ category_override: Optional[str],
+ method: bool = False,
+ pyi: bool = False,
+) -> PythonSignature:
+ args: List[Argument] = []
+ args.extend(func.arguments.pre_self_positional)
+ # Skip SelfArgument if this is method.
+ if not method and func.arguments.self_arg is not None:
+ args.append(func.arguments.self_arg.argument)
+ args.extend(func.arguments.post_self_positional)
+ args.extend(func.arguments.pre_tensor_options_kwarg_only)
+ # Skip TensorOptionsArguments. Python side TensorOptions
+ # arguments are created based on different rules - see below.
+ args.extend(func.arguments.post_tensor_options_kwarg_only)
+ args.extend(func.arguments.out)
+
+ input_arg_set = {a.name for a in func.arguments.flat_positional}
+ kwarg_only_set = {a.name for a in func.arguments.flat_kwarg_only}
+ out_arg_set = {a.name for a in func.arguments.out}
+
+ input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args)))
+ input_kwargs = tuple(
+ map(argument, filter(lambda a: a.name in kwarg_only_set, args))
+ )
+ outputs = tuple(map(argument, filter(lambda a: a.name in out_arg_set, args)))
+
+ # Reintroduce the scattered fields of TensorOptions for Python.
+ # Compared to the cpp counterpart, the python arguments have a new property
+ # (default_init) and a new argument 'requires_grad', which require some
+ # special handling.
+ # [old codegen] TODO: because these aren't guaranteed to be 100% faithful
+ # to the original versions in the yaml, this recreation is a potential
+ # source of drift between eager and JIT. Pull this logic out to a shared place.
+
+ has_tensor_input_arg = any(
+ a.type.is_tensor_like() for a in func.arguments.flat_non_out
+ )
+ if any(a.name == "requires_grad" for a in func.schema_order_arguments()):
+ raise ValueError(
+ "argument named requires_grad is reserved, should not explicitly add it in the schema"
+ )
+
+ # [old codegen] this probably won't work if one of the returns is not a tensor,
+ # but it will produce a compile-time error that is obvious.
+ has_tensor_return = any(r.type.is_tensor_like() for r in func.returns)
+
+ name: str = cpp.name(func)
+ is_factory_function = category_override == "factory" or (
+ has_tensor_return and not has_tensor_input_arg
+ )
+ is_like_or_new_function = (
+ category_override in ("new", "like")
+ or name.startswith("new_")
+ or name.endswith("_like")
+ )
+ is_dummy_function = category_override == "dummy"
+
+ tensor_options_args: List[PythonArgument] = []
+ if (is_factory_function or is_like_or_new_function) and not is_dummy_function:
+
+ def topt_default_init(name: str) -> Optional[str]:
+ topt_args = func.arguments.tensor_options
+ if topt_args is None:
+ return None
+ a = getattr(topt_args, name)
+ if a.default is None or a.default == "None":
+ return None
+ return cpp.default_expr(a.default, a.type, symint=False)
+
+ tensor_options_args.append(
+ PythonArgument(
+ name="dtype",
+ type=OptionalType(BaseType(BaseTy.ScalarType)),
+ default="None",
+ default_init=(
+ None if is_like_or_new_function else topt_default_init("dtype")
+ ),
+ )
+ )
+ tensor_options_args.append(
+ PythonArgument(
+ name="layout",
+ type=OptionalType(BaseType(BaseTy.Layout)),
+ default="None",
+ default_init=(
+ None if is_like_or_new_function else topt_default_init("layout")
+ ),
+ )
+ )
+ tensor_options_args.append(
+ PythonArgument(
+ name="device",
+ type=OptionalType(BaseType(BaseTy.Device)),
+ default="None",
+ default_init=(
+ None
+ if is_like_or_new_function
+ else (
+ topt_default_init("device")
+ or "torch::tensors::get_default_device()"
+ )
+ ),
+ )
+ )
+ tensor_options_args.append(
+ PythonArgument(
+ name="pin_memory",
+ type=OptionalType(BaseType(BaseTy.bool)),
+ default="False",
+ default_init=None,
+ )
+ )
+ tensor_options_args.append(
+ PythonArgument(
+ name="requires_grad",
+ type=OptionalType(BaseType(BaseTy.bool)),
+ default="False",
+ default_init=None,
+ )
+ )
+
+ returns = PythonReturns(returns=func.returns)
+
+ return PythonSignature(
+ name=str(func.name.name),
+ input_args=input_args,
+ input_kwargs=input_kwargs,
+ output_args=PythonOutArgument.from_outputs(outputs),
+ tensor_options_args=tuple(tensor_options_args),
+ returns=returns,
+ method=method,
+ )
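+
+ # Example (editor's illustration): for a factory-style schema (tensor return, no
+ # tensor inputs), the resulting PythonSignature additionally carries the scattered
+ # TensorOptions kwargs built above: dtype=None, layout=None, device=None,
+ # pin_memory=False and requires_grad=False.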
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# Python Interface
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+def structseq_fieldnames(returns: Tuple[Return, ...]) -> List[str]:
+ if len(returns) <= 1 or all(r.name is None for r in returns):
+ return []
+ else:
+ if any(r.name is None for r in returns):
+ # When building on Windows, `PyStructSequence_UnnamedField` could not be
+ # resolved by the linker for some reason, which causes an error when building:
+ #
+ # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol
+ # PyStructSequence_UnnamedField
+ #
+ # Thus, at this point in time, we do not support unnamed
+ # fields in structseq; you must either name all fields,
+ # or none of them.
+ raise ValueError("Unnamed field is not supported by codegen")
+
+ return [str(r.name) for r in returns]
+
+
+def argument_type_str_pyi(t: Type) -> str:
+ add_optional = False
+ if isinstance(t, OptionalType):
+ t = t.elem
+ add_optional = True
+
+ if isinstance(t, BaseType):
+ if t.name in [BaseTy.int, BaseTy.DeviceIndex]:
+ ret = "_int"
+ if t.name == BaseTy.SymInt:
+ ret = "Union[_int, SymInt]"
+ elif t.name == BaseTy.float:
+ ret = "_float"
+ elif t.name == BaseTy.str:
+ ret = "str"
+ elif t.name == BaseTy.Scalar:
+ ret = "Union[Number, _complex]"
+ elif t.name == BaseTy.ScalarType:
+ ret = "_dtype"
+ elif t.name == BaseTy.bool:
+ ret = "_bool"
+ elif t.name == BaseTy.QScheme:
+ ret = "_qscheme"
+ elif t.name == BaseTy.Layout:
+ ret = "_layout"
+ elif t.name == BaseTy.Device:
+ ret = "Optional[DeviceLikeType]"
+ elif t.name == BaseTy.MemoryFormat:
+ ret = "memory_format"
+ elif t.name == BaseTy.Dimname:
+ ret = "Union[str, ellipsis, None]"
+ elif t.name == BaseTy.Storage:
+ ret = "Union[Storage, UntypedStorage]"
+ elif t.name in [BaseTy.Tensor, BaseTy.Generator, BaseTy.Stream]:
+ # These python schema type names line up with their function schema names
+ ret = t.name.name
+
+ elif isinstance(t, ListType):
+ if str(t.elem) == "int":
+ ret = "Union[_int, _size]" if t.size is not None else "_size"
+ elif t.is_tensor_like():
+ # TODO: this doesn't seem right...
+ # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]]
+ # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]]
+ if isinstance(t.elem, OptionalType):
+ add_optional = True
+ ret = (
+ "Union[Tensor, Tuple[Tensor, ...], List[Tensor]]"
+ if t.size is not None
+ else "Union[Tuple[Tensor, ...], List[Tensor]]"
+ )
+ elif str(t.elem) == "float":
+ ret = "Sequence[_float]"
+ elif str(t.elem) == "SymInt" and t.size is not None:
+ elem = argument_type_str_pyi(t.elem)
+ ret = f"Union[{elem}, Sequence[{elem}]]"
+ else:
+ elem = argument_type_str_pyi(t.elem)
+ ret = f"Sequence[{elem}]"
+
+ else:
+ raise RuntimeError(f"unrecognized type {repr(t)}")
+
+ if add_optional:
+ ret = "Optional[" + ret + "]"
+
+ return ret
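+
+ # Example mappings (editor's illustration, derived from the branches above):
+ # 'Tensor?' -> 'Optional[Tensor]'
+ # 'int[]' -> '_size'
+ # 'Scalar' -> 'Union[Number, _complex]'
+ # 'float[]' -> 'Sequence[_float]'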
+
+
+def return_type_str_pyi(t: Type) -> str:
+ # Where arguments are open to accepting Union, return types should return
+ # concrete types
+
+ if isinstance(t, OptionalType):
+ inner = return_type_str_pyi(t.elem)
+ return f"Optional[{inner}]"
+
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Device:
+ return "_device"
+ elif t.name == BaseTy.Dimname:
+ ret = "Optional[str]"
+ else:
+ return argument_type_str_pyi(t)
+
+ if isinstance(t, ListType):
+ inner = return_type_str_pyi(t.elem)
+ return f"Tuple[{inner}, ...]"
+
+ return argument_type_str_pyi(t)
+
+
+def returns_structseq_pyi(signature: PythonSignature) -> Optional[Tuple[str, str]]:
+ python_returns = [return_type_str_pyi(r.type) for r in signature.returns.returns]
+ structseq_name = signature.name
+ field_names = structseq_fieldnames(signature.returns.returns)
+ if field_names:
+ # These types are structseq objects which act like named tuples, but
+ # the constructor acts like the constructor of tuple. Using typing.NamedTuple
+ # does not allow us to override __init__.
+ field_names_str = ", ".join(repr(name) for name in field_names)
+ seq_type = f"Tuple[{', '.join(python_returns)}]"
+ structseq_def_lines = [
+ f"class {structseq_name}({seq_type}):",
+ ]
+ for name, typ in zip(field_names, python_returns):
+ structseq_def_lines.extend(
+ [
+ " @property",
+ f" def {name}(self) -> {typ}: ...",
+ ]
+ )
+ structseq_def_lines.extend(
+ [
+ f" def __new__(cls, sequence: {seq_type}): ...",
+ f" n_fields: _int = {len(field_names)}",
+ f" n_sequeunce_fields: _int = {len(field_names)}",
+ " n_unnamed_fields: _int = 0",
+ " def __init_subclass__(cls) -> NoReturn: ... # prohibit subclassing",
+ "", # add an extra newline
+ ]
+ )
+ structseq_def = "\n".join(structseq_def_lines)
+ # Example:
+ # structseq_def = (
+ # "class max(Tuple[Tensor, Tensor]):\n"
+ # " @property\n"
+ # " def values(self) -> Tensor: ...\n"
+ # " @property\n"
+ # " def indices(self) -> Tensor: ...\n"
+ # " def __new__(cls, sequence: Tuple[Tensor, Tensor]): ...\n"
+ # " n_fields: _int = 2",
+ # " n_sequeunce_fields: _int = 2",
+ # " n_unnamed_fields: _int = 0",
+ # " def __init_subclass__(cls) -> NoReturn: ... # prohibit subclassing",
+ # )
+ return structseq_name, structseq_def
+ return None
+
+
+def returns_str_pyi(signature: PythonSignature) -> str:
+ field_names = structseq_fieldnames(signature.returns.returns)
+ if field_names:
+ return f"torch.return_types.{signature.name}"
+
+ python_returns = [return_type_str_pyi(r.type) for r in signature.returns.returns]
+ if len(python_returns) > 1:
+ return "Tuple[" + ", ".join(python_returns) + "]"
+ if len(python_returns) == 1:
+ return python_returns[0]
+ return "None"
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# C++ Function Dispatch
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+# This section provides APIs to generate the code that does C++ function
+# dispatch. The C++ function call is wrapped by a lambda function.
+# For example:
+#
+# // aten::selu_(Tensor(a!) self) -> Tensor(a!)
+# auto dispatch_selu_ = [](Tensor self) -> Tensor {
+# pybind11::gil_scoped_release no_gil;
+# return at::selu_(self);
+# };
+#
+# The lambda function's signature follows the C++ signature in common
+# cases, e.g.:
+#
+# // aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+# [](const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor
+#
+ # For the out variant, the 'out' argument's type is changed from 'Tensor &'
+ # to 'Tensor'. This is because the lambda is called with the PythonArgParser
+ # output '_r.tensor(3)', which is a stack-allocated object and needs to be
+ # passed by value. Also see comments in 'dispatch_lambda_return_str()'.
+#
+# // aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+# [](Tensor out, const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor
+#
+ # For the multi-output case it can keep using reference types because the
+ # PythonArgParser output has been unpacked into local variables, e.g.:
+#
+# // aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *,
+# // Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+ # [](Tensor & max, Tensor & max_values, const Tensor & self, Dimname dim, bool keepdim) -> std::tuple<Tensor, Tensor>
+#
+ # For deprecated python signatures, the lambda should follow the deprecated python arg order.
+ # TODO: This is to keep the same byte-for-byte result as the old codegen - maybe unnecessary?
+
+
+def dispatch_lambda_args(
+ ps: PythonSignature, f: NativeFunction, symint: bool = True
+) -> Tuple[DispatchLambdaArgument, ...]:
+ if isinstance(ps, PythonSignatureDeprecated):
+ schema = ps.deprecated_schema
+ else:
+ schema = f.func
+
+ # Start with cpp arguments - the dispatch lambda signature always includes 'self'
+ cpp_args = cpp.arguments(
+ arguments=schema.arguments,
+ faithful=False,
+ symint=symint,
+ method=False,
+ cpp_no_default_args=f.cpp_no_default_args,
+ )
+ out_args: Set[str] = {a.name for a in schema.arguments.out}
+
+ # Convert from cpp argument to lambda argument
+ def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument:
+ type_str = cpp_arg.type
+ is_out_arg = cpp_arg.name in out_args
+ if ps.method and cpp_arg.name == "self":
+ # For method's 'self', we can use 'const Tensor &' and simply ignore mutability!
+ type_str = "const at::Tensor &"
+ else:
+ # For other cases we need to prevent dangling refs to temps (unless it's an
+ # unpacked scattered output).
+ # The reason is explained in the comments above and in 'dispatch_lambda_return_str()'.
+ # TODO: avoid this special handling?
+ ensure_temp_safe = len(out_args) <= 1 or not is_out_arg
+ if ensure_temp_safe:
+ type_str = {
+ "at::Tensor &": "at::Tensor",
+ }.get(type_str, type_str)
+ return DispatchLambdaArgument(
+ name=cpp_arg.name,
+ type_str=type_str,
+ is_out_arg=is_out_arg,
+ )
+
+ return tuple(map(dispatch_lambda_arg, cpp_args))
+
+
+# [old codegen] XXX: if you got here because of an assertion failure, it doesn't mean
+# it's enough to just extend the list here. Before you do this, make sure
+# to add an appropriate wrap() overload in torch/csrc/autograd/utils/wrap_outputs.h.
+SUPPORTED_RETURN_TYPES = {
+ "at::Tensor",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple",
+ "::std::tuple>",
+ "::std::vector",
+ # Needed for flash attention forw/backward
+ "::std::tuple",
+ "at::Scalar",
+ "bool",
+ "int64_t",
+ "void*",
+ "void",
+ "at::QScheme",
+ "double",
+ "at::IntArrayRef",
+ "at::ScalarType",
+ "at::Stream",
+}
+
+
+def dispatch_lambda_return_str(f: NativeFunction) -> str:
+ # [old codegen] Remove type annotation (e.g. 'Tensor' rather than 'Tensor &')
+ # because the dispatch lambdas take mutable arguments *by value*, not
+ # by reference. If you then return a reference to such an argument, you
+ # will now have a pointer to a dangling stack entry. Not good.
+ #
+ # You want:
+ #
+ # auto dispatch_selu_ = [](Tensor self) -> Tensor { ...; return at::selu_(self); };
+ # ^^^^^^
+ #
+ # *not*
+ #
+ # auto dispatch_selu_ = [](Tensor self) -> Tensor& { ...; return at::selu_(self); };
+ # ^^^^^^^
+ #
+ # (NB: We can't make dispatch_selu_ take Tensor&, because the enclosing
+ # codegen looks like dispatch_selu_(_r.tensor(0)), and you can't take a
+ # mutable reference to temporary. Maybe we could assign it to a
+ # variable itself.)
+ returns_without_annotation = tuple(
+ Return(r.name, r.type, None) for r in f.func.returns
+ )
+ return_str = cpp.returns_type(returns_without_annotation, symint=True).cpp_type()
+ if return_str not in SUPPORTED_RETURN_TYPES:
+ raise RuntimeError(f"{f.func.name} returns unsupported type {return_str}")
+ return return_str
+
+
+def cpp_dispatch_target(f: NativeFunction) -> str:
+ symint = f.func.has_symint()
+ name = cpp.name(f.func, symint_overload=symint)
+ if Variant.method in f.variants:
+ return f"self.{name}"
+ if Variant.function in f.variants:
+ if has_tensor_options(f) or f.func.name.name.base.endswith("_like"):
+ namespace = "torch"
+ else:
+ namespace = "at"
+ return f"{namespace}::{name}"
+ raise RuntimeError(f"could not dispatch, neither function nor method: {f.func}")
+
+
+def cpp_dispatch_exprs(
+ f: NativeFunction,
+ *,
+ python_signature: Optional[PythonSignature] = None,
+) -> Tuple[str, ...]:
+ cpp_args: Sequence[Binding] = _cpp_signature(f, method=False).arguments()
+
+ exprs: Tuple[str, ...] = tuple()
+ if not isinstance(python_signature, PythonSignatureDeprecated):
+ # By default the exprs are consistent with the C++ signature.
+ exprs = tuple(a.name for a in cpp_args)
+ else:
+ # For deprecated python signatures we may need to fill in some constants.
+ exprs = tuple(
+ filter(
+ lambda n: n != "out" or f.func.is_out_fn(),
+ python_signature.deprecated_args_exprs,
+ )
+ )
+
+ if Variant.method in f.variants:
+ exprs = tuple(filter("self".__ne__, exprs))
+
+ return exprs
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# Python / C++ Args Binding
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+# We explicitly enumerate the PythonArgParser unpacking methods for all
+# supported types. This might be more verbose than necessary, partially
+# because of the irregularity of unpacking method naming, partially
+# because we want to mimic the old codegen behavior - to reject
+# unexpected and/or unsupported cases which the old codegen rejects.
+# For certain cases it is intentionally more restrictive than necessary,
+ # e.g.: it doesn't accept a doublelist with a definite size.
+def arg_parser_unpack_method(
+ t: Type, default: Optional[str], default_init: Optional[str], *, symint: bool = True
+) -> str:
+ has_default_init = default_init is not None
+ if has_default_init and str(t) not in (
+ "ScalarType?",
+ "ScalarType",
+ "Device",
+ "Device?",
+ "Layout",
+ "Layout?",
+ "bool",
+ "bool?",
+ ):
+ raise RuntimeError(f"type '{t}' does not supported unpacking with default")
+
+ if isinstance(t, BaseType):
+ if t.name in [
+ BaseTy.Tensor,
+ BaseTy.Stream,
+ BaseTy.Storage,
+ BaseTy.Scalar,
+ BaseTy.Dimname,
+ ]:
+ # These unpack methods line up with their schema names
+ return t.name.name.lower()
+ elif t.name == BaseTy.ScalarType:
+ return "scalartypeWithDefault" if has_default_init else "scalartype"
+ elif t.name == BaseTy.Device:
+ return "deviceWithDefault" if has_default_init else "device"
+ elif t.name == BaseTy.DeviceIndex:
+ return "toInt64"
+ elif t.name == BaseTy.int:
+ return "toInt64"
+ elif t.name == BaseTy.SymInt:
+ return "toSymInt" if symint else "toInt64"
+ elif t.name == BaseTy.bool:
+ return "toBoolWithDefault" if has_default_init else "toBool"
+ elif t.name == BaseTy.float:
+ return "toDouble"
+ elif t.name == BaseTy.str:
+ return "stringView"
+ elif t.name == BaseTy.Layout:
+ return "layoutWithDefault" if has_default_init else "layout"
+ elif t.name == BaseTy.MemoryFormat:
+ return "memoryformat"
+
+ elif isinstance(t, OptionalType):
+ if str(t.elem) == "Tensor":
+ return "optionalTensor"
+ elif str(t.elem) == "Generator":
+ return "generator"
+ elif str(t.elem) == "Dimname[]":
+ return "toDimnameListOptional"
+ elif not has_default_init and default in (None, "None", "c10::nullopt"):
+ # If default is None: append 'Optional' to elem's unpacking method
+ return (
+ arg_parser_unpack_method(t.elem, None, None, symint=symint) + "Optional"
+ )
+ else:
+ # Otherwise, load as underlying type with default
+ return arg_parser_unpack_method(
+ t.elem, default, default_init, symint=symint
+ )
+
+ elif isinstance(t, ListType):
+ if str(t.elem) == "Tensor":
+ # accept and use definite size
+ return f"tensorlist_n<{t.size}>" if t.size is not None else "tensorlist"
+ elif str(t.elem) == "Tensor?":
+ return "list_of_optional_tensors"
+ elif str(t.elem) == "Dimname":
+ # accept definite size
+ return "dimnamelist"
+ elif str(t.elem) == "int":
+ # accept definite size
+ return "intlist"
+ elif str(t.elem) == "float":
+ return "doublelist"
+ elif str(t.elem) == "SymInt":
+ # accept definite size
+ return "symintlist" if symint else "intlist"
+ elif str(t.elem) == "Scalar":
+ return "scalarlist"
+ raise RuntimeError(f"type '{t}' is not supported by PythonArgParser")
+
+
+# Return RHS expression for python argument using PythonArgParser output.
+# e.g. for arg name 'foo', arg type 'bool', arg_index = 2, returns '_r.toBool(2)'
+def arg_parser_output_expr(
+ arg_index: int, a: PythonArgument, *, symint: bool = True
+) -> PythonArgParserOutputExpr:
+ has_default = a.default_init is not None
+ unpack_method = arg_parser_unpack_method(
+ t=a.type, default=a.default, default_init=a.default_init, symint=symint
+ )
+ default = f", {a.default_init}" if has_default else ""
+ expr = f"_r.{unpack_method}({arg_index}{default})"
+
+ return PythonArgParserOutputExpr(
+ name=a.name,
+ expr=expr,
+ index=arg_index,
+ argument=a,
+ )
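+
+ # Example (editor's illustration): for a hypothetical 'bool?' argument at index 3
+ # whose default_init is 'true', the generated expr is '_r.toBoolWithDefault(3, true)';
+ # for a 'bool?' argument with default 'None' and no default_init it is
+ # '_r.toBoolOptional(3)'.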
+
+
+# Returns a map with key = arg_name and value = PythonArgParserOutputExpr.
+def arg_parser_output_exprs(
+ ps: PythonSignature, f: NativeFunction, *, symint: bool = True
+) -> Dict[str, PythonArgParserOutputExpr]:
+ return {
+ e.name: e
+ for i, a in enumerate(ps.arguments())
+ for e in (arg_parser_output_expr(i, a, symint=symint),)
+ }
+
+
+# argument name to type for scattered tensor options fields
+TENSOR_OPTIONS_FIELDS = {
+ "dtype": "ScalarType?",
+ "device": "Device?",
+ "layout": "Layout?",
+ "pin_memory": "bool?",
+ "requires_grad": "bool?",
+}
+
+
+# bind arg parser outputs (python args) with dispatch lambda arguments (c++ args).
+def dispatch_lambda_exprs(
+ ps: PythonSignature, f: NativeFunction, *, symint: bool = True
+) -> DispatchLambdaArgumentExprs:
+ # This method is to bind 'arg_parser_outputs' and 'lambda_args' by producing
+ # 'inits' and 'lambda_args_exprs' for each lambda argument using arg parser
+ # outputs.
+ arg_parser_outputs = arg_parser_output_exprs(ps, f, symint=symint)
+ lambda_args = dispatch_lambda_args(ps, f, symint=symint)
+ inits: List[str] = []
+ lambda_args_exprs: Dict[str, str] = {}
+
+ has_toptions = has_tensor_options(f)
+
+ # 1. special inits/unpacking to provide binding exprs for lambda arguments.
+ for a in ps.arguments(skip_tensor_options=True):
+ name = a.name
+ arg_parser_expr = arg_parser_outputs[a.name].expr
+
+ if has_toptions and name == "self":
+ # TODO: why does this need to be a special case?
+ inits.extend(
+ [
+ f"auto self = {arg_parser_expr};",
+ ]
+ )
+ lambda_args_exprs[name] = name
+ elif (
+ isinstance(a, PythonOutArgument)
+ and len(a.outputs) > 1
+ and f.func.is_out_fn()
+ ):
+ inits.extend(
+ [
+ f"auto out = {arg_parser_expr};",
+ ]
+ )
+ for i, out_arg in enumerate(a.outputs):
+ lambda_args_exprs[out_arg.name] = f"out[{i}]"
+ elif str(a.type) == "Dimname[]?":
+ # [old codegen]
+ # TODO: make this part of something more general, or get rid of it.
+ # optional<ArrayRef<T>> are special. The PythonArgParser returns an
+ # optional<vector<T>>, which cannot be implicitly converted to
+ # optional<ArrayRef<T>>. One needs to unwrap the optional and rewrap.
+ inits.extend(
+ [
+ f"auto __{name} = {arg_parser_expr};",
+ f"c10::optional {name} = __{name} ? c10::make_optional(DimnameList(__{name}.value())) : c10::nullopt;", # noqa: B950
+ ]
+ )
+ lambda_args_exprs[name] = name
+ else:
+ # default case - directly using PythonArgParser output expr
+ lambda_args_exprs[name] = arg_parser_expr
+
+ # method's self is passed directly to python binding, rather than parsed
+ if ps.method:
+ lambda_args_exprs["self"] = "self"
+
+ # 2. special packing/checking for TensorOptions.
+ tensor_options_args_names = [a.name for a in ps.tensor_options_args]
+ if has_toptions:
+ if f.func.is_out_fn():
+ raise RuntimeError(f"{f.func}: tensor options with output arg")
+ for a in ps.tensor_options_args:
+ if a.name not in TENSOR_OPTIONS_FIELDS:
+ raise RuntimeError(
+ f"{f.func}: unrecognized tensor options field '{a.name}' in python binding arguments"
+ )
+ if str(a.type) != TENSOR_OPTIONS_FIELDS.get(a.name):
+ raise RuntimeError(
+ f"{f.func}: unrecognized type '{str(a.type)}' for tensor options field '{a.name}'"
+ )
+ if not all(
+ a in tensor_options_args_names for a in TENSOR_OPTIONS_FIELDS.keys()
+ ):
+ raise RuntimeError(
+ f"{f.func}: incomplete tensor options args: {tensor_options_args_names}"
+ )
+
+ inits.append(
+ f"""\
+const auto options = TensorOptions()
+ .dtype({arg_parser_outputs['dtype'].expr})
+ .device({arg_parser_outputs['device'].expr})
+ .layout({arg_parser_outputs['layout'].expr})
+ .requires_grad({arg_parser_outputs['requires_grad'].expr})
+ .pinned_memory({arg_parser_outputs['pin_memory'].expr});
+torch::utils::maybe_initialize_device(options);
+"""
+ )
+ lambda_args_exprs["options"] = "options"
+
+ # 3. special case - access scattered TensorOptions fields without packing
+ # TODO: maybe move to the generator side as it's not related to binding.
+ if not has_toptions and tensor_options_args_names:
+ if "dtype" in tensor_options_args_names:
+ # we're an output-arg variant, check these args against output tensor
+ if not f.func.is_out_fn():
+ raise RuntimeError(
+ f"{f.func}: dtype in tensor_options_args without output arg"
+ )
+ if not all(a in tensor_options_args_names for a in ("layout", "device")):
+ raise RuntimeError(
+ f"{f.func}: incomplete tensor options for output check"
+ )
+
+ inits.append(
+ f"""\
+check_out_type_matches({arg_parser_outputs['out'].expr}, {arg_parser_outputs['dtype'].expr},
+ {arg_parser_outputs['dtype'].is_none_expr}, {arg_parser_outputs['layout'].expr},
+ {arg_parser_outputs['device'].expr}, {arg_parser_outputs['device'].is_none_expr});
+"""
+ )
+ # we'll set requires_grad on outgoing tensor
+ if "requires_grad" not in tensor_options_args_names:
+ raise RuntimeError(
+ f'{f.func}: expected "requires_grad" in tensor_options_args absent, but found [{tensor_options_args_names}]'
+ )
+
+ return DispatchLambdaArgumentExprs(
+ exprs=tuple(lambda_args_exprs[a.name] for a in lambda_args),
+ inits=inits,
+ )
diff --git a/MLPY/Lib/site-packages/torchgen/api/structured.py b/MLPY/Lib/site-packages/torchgen/api/structured.py
new file mode 100644
index 0000000000000000000000000000000000000000..10a83b65d9dcbe9211c4f5cbd2b16f4f3f1506ba
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/structured.py
@@ -0,0 +1,157 @@
+from typing import List, Union
+
+from torchgen.api import cpp
+
+from torchgen.api.types import (
+ ArgName,
+ ArrayRefCType,
+ BaseCType,
+ Binding,
+ ConstRefCType,
+ dimnameListT,
+ intArrayRefT,
+ iOptTensorListRefT,
+ iTensorListRefT,
+ NamedCType,
+ OptionalCType,
+ optionalIntArrayRefT,
+ optionalScalarRefT,
+ optionalTensorRefT,
+ scalarT,
+ tensorT,
+)
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ ListType,
+ NativeFunctionsGroup,
+ OptionalType,
+ SelfArgument,
+ TensorOptionsArguments,
+ Type,
+)
+from torchgen.utils import assert_never
+
+# This file describes the translation of JIT schema to the structured functions API.
+ # This is similar to the native API, but a number of historical problems with the
+ # native API have been fixed.
+
+
+# Translation of types occurring in JIT arguments to a C++ argument type.
+# NB: For now, mutable doesn't do anything; but it could if we make
+# some more nominal types
+def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> NamedCType:
+ # If it's a value type, do the value type translation
+ # NB: structured kernels ALWAYS have symint off, since they involve actual
+ # kernels that require real ints. The one exception is the
+ # CompositeExplicitAutograd and the meta function (which could
+ # hypothetically be SymInt), but for simplicity we plan for these to just
+ # be handled in Python
+ r = cpp.valuetype_type(t, symint=False, binds=binds)
+ if r is not None:
+ return r
+
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor:
+ return NamedCType(binds, ConstRefCType(BaseCType(tensorT)))
+ elif t.name == BaseTy.Scalar:
+ return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+ else:
+ raise AssertionError(f"base type should have been value type {t}")
+ elif isinstance(t, OptionalType):
+ if t.elem == BaseType(BaseTy.Tensor):
+ return NamedCType(binds, BaseCType(optionalTensorRefT))
+ elif t.elem == BaseType(BaseTy.Scalar):
+ return NamedCType(binds, BaseCType(optionalScalarRefT))
+ elif isinstance(t.elem, ListType) and str(t.elem.elem) == "int":
+ return NamedCType(binds, BaseCType(optionalIntArrayRefT))
+ elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
+ return NamedCType(binds, OptionalCType(elem.type))
+ elif isinstance(t, ListType):
+ if t.elem == BaseType(BaseTy.Tensor):
+ return NamedCType(binds, ConstRefCType(BaseCType(iTensorListRefT)))
+ elif t.elem == OptionalType(BaseType(BaseTy.Tensor)):
+ return NamedCType(binds, BaseCType(iOptTensorListRefT))
+ # TODO: delete these special cases; see torchgen.api.cpp--these
+ # must be changed in tandem, but there are problems; see
+ # https://github.com/pytorch/pytorch/pull/51485
+ elif str(t.elem) == "int":
+ return NamedCType(binds, BaseCType(intArrayRefT))
+ elif str(t.elem) == "Dimname":
+ return NamedCType(binds, BaseCType(dimnameListT))
+ elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
+ return NamedCType(binds, ArrayRefCType(elem.type))
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+def argument_type(a: Argument, *, binds: ArgName) -> NamedCType:
+ return argumenttype_type(a.type, mutable=a.is_write, binds=binds)
+
+
+# returns_type intentionally omitted, because structured kernels never "return";
+# instead, they always indirectly report their outputs (in the case of a meta
+# function, by calling set_output; in the case of an impl function, by writing
+# directly into the provided out argument).
+
+
+# Structured kernels are never defaulted
+def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[Binding]:
+ if isinstance(a, Argument):
+ return [
+ Binding(
+ nctype=argument_type(a, binds=a.name),
+ name=a.name,
+ default=None,
+ argument=a,
+ )
+ ]
+ elif isinstance(a, SelfArgument):
+ return argument(a.argument)
+ elif isinstance(a, TensorOptionsArguments):
+ raise AssertionError("structured kernels don't support TensorOptions yet")
+ else:
+ assert_never(a)
+
+
+def impl_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+ args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+
+ if g.out.precomputed:
+ # A list of parameters for the impl function with
+ # certain parameters replaced with precomputed counterparts
+ # as specified in native_functions.yaml.
+ non_out_args_replaced: List[
+ Union[Argument, TensorOptionsArguments, SelfArgument]
+ ] = []
+ for a in g.out.func.arguments.non_out:
+ if isinstance(a, Argument) and a.name in g.out.precomputed.replace:
+ # If a is in precompute.replace, append the parameters
+ # that should replace it onto non_out_args_replaced.
+ non_out_args_replaced.extend(g.out.precomputed.replace[a.name])
+ else:
+ # If not, push a as it is.
+ non_out_args_replaced.append(a)
+
+ args.extend(non_out_args_replaced)
+ # g.out.precomputed.add is the list of parameters that are added
+ # without replacement after the non out args and just before the out args
+ args.extend(g.out.precomputed.add)
+ else:
+ args.extend(g.out.func.arguments.non_out)
+
+ args.extend(g.out.func.arguments.out)
+ return [r for arg in args for r in argument(arg)]
+
+
+def meta_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+ args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+ args.extend(g.functional.func.arguments.non_out)
+ return [r for arg in args for r in argument(arg)]
+
+
+def out_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+ args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+ args.extend(g.out.func.arguments.out)
+ return [r for arg in args for r in argument(arg)]
diff --git a/MLPY/Lib/site-packages/torchgen/api/translate.py b/MLPY/Lib/site-packages/torchgen/api/translate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7824446f4b6018f0a6eb707438553dc453d43e54
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/translate.py
@@ -0,0 +1,430 @@
+from typing import Dict, List, NoReturn, Sequence, Union
+
+from torchgen.api.types import (
+ ArrayRefCType,
+ BaseCType,
+ Binding,
+ boolT,
+ ConstRefCType,
+ deviceT,
+ Expr,
+ intArrayRefT,
+ iOptTensorListRefT,
+ layoutT,
+ ListCType,
+ longT,
+ memoryFormatT,
+ MutRefCType,
+ NamedCType,
+ opmath_t,
+ OptionalCType,
+ optionalIntArrayRefT,
+ optionalScalarRefT,
+ optionalSymIntArrayRefT,
+ optionalTensorRefT,
+ scalar_t,
+ scalarT,
+ scalarTypeT,
+ SpecialArgName,
+ symIntArrayRefT,
+ SymIntT,
+ tensorOptionsT,
+ tensorT,
+ VectorCType,
+)
+
+# This file implements a small program synthesis engine that implements
+# conversions between one API to another.
+#
+ # The key data type in this file is NamedCType, short for Named C++ semantic type. A NamedCType
+# represents a C++ type, plus semantic information about what it represents.
+# For example, consider the argument "bool pin_memory"; its normal C++ type is
+# "bool", but its C++ semantic type also keeps track that this represents a
+# "pin_memory"; you can't just use a random other boolean in a context where you
+# need a "pin_memory"!
+#
+# The translator takes a list of needed NamedCTypes, and then figures out how
+# to construct expressions with these NamedCTypes from the given bindings. Many
+# of these expressions are trivial (I need a Tensor other; there's a Tensor
+ # other in scope); others are more nontrivial and may require packing/unpacking.
+# Some examples of non-trivial action:
+#
+# - Need the "dtype" binding? Well, maybe "dtype" isn't available
+# in the context, instead, "options" is, and you need to extract
+# it from there. (Gather)
+#
+# - Need the "context" binding? Well, maybe "context" isn't available
+# in the context, and you need to construct it from "dtype", "device",
+# etc. (Scatter)
+#
+# - Need the "memory_format" binding? Well, actually, it's available
+# from both "memory_format" and "options", so you had better make sure
+# they are consistent. (Join)
+
+options_ctype = NamedCType("options", ConstRefCType(BaseCType(tensorOptionsT)))
+
+out_tensor_ctype = NamedCType("out", ConstRefCType(BaseCType(tensorT)))
+
+longVec_ctype = VectorCType(BaseCType(longT))
+longSymVec_ctype = VectorCType(BaseCType(SymIntT))
+optionalLongVec_ctype = OptionalCType(VectorCType(BaseCType(longT)))
+optionalScalar_ctype = OptionalCType(BaseCType(scalarT))
+optionalTensor_ctype = OptionalCType(BaseCType(tensorT))
+
+
+class UnsatError(RuntimeError):
+ pass
+
+
+# Given a set of in-scope bindings and a set of target bindings, synthesize
+# a list of expressions that uses only the in-scope bindings (bindings) that
+# have all of the types of goals. You may want to use this function if
+# you're generating code for a function like:
+#
+# void f({args}) {
+# g({exprs}); // g is a different API
+# }
+#
+# and you need to generate "exprs".
+#
+# Typically, a list of Bindings is convenient to get (you usually call something
+# like arguments() to get them); but technically you only need less information:
+# for 'bindings' an (un-ordered) list of Exprs is sufficient; similarly, for
+# 'goals', an (ordered) list of NamedCType goals is sufficient. If you are doing
+# something more complicated, e.g., tracking the set of bindings in a context,
+# you may find using these smaller types more convenient.
+def translate(
+ bindings: Sequence[Union[Expr, Binding]],
+ goals: Sequence[Union[NamedCType, Binding]],
+ *,
+ method: bool = False,
+ allow_expensive_conversions: bool = False,
+) -> List[Expr]:
+ binding_exprs: List[Expr] = []
+ for b in bindings:
+ if isinstance(b, Binding):
+ binding_exprs.append(
+ Expr(
+ expr=b.name,
+ type=b.nctype,
+ )
+ )
+ else:
+ binding_exprs.append(b)
+
+ goal_ctypes: List[NamedCType] = []
+ for g in goals:
+ if isinstance(g, Binding):
+ goal_ctypes.append(g.nctype)
+ else:
+ goal_ctypes.append(g)
+
+ # Add all the bindings to the context
+ ctx: Dict[NamedCType, str] = {}
+ for b in binding_exprs:
+ ctx[b.type] = b.expr
+
+ # While we're at it, do some simple forward inference, looking through
+ # constructors.
+ #
+ # NB: When should you do forward inference versus backward inference?
+ # The general idea:
+ #
+ # - Backward inference WHEN the goal gets smaller
+ # - Forward inference WHEN the hypothesis gets smaller
+ #
+ # This helps ensure termination: backward inference starts with a goal
+ # and tries to make it simpler and simpler until it's trivial; if the
+ # goal can grow in size, we blow up to a really huge goal size.
+ # Similarly, with forward inference we take hypotheses and decompose
+ # them into simpler hypotheses; if hypotheses could expand in size,
+ # we also have potential nontermination. (In the code below, forward
+ # inference is only ever carried out at a single step, but you could
+ # imagine repeated application of forward inference being profitable.)
+ #
+ # A good starting point in the literature for exploring more about proof
+ # search are these lecture notes
+ # https://www.cs.cmu.edu/~fp/courses/oregon-m10/04-focusing.pdf
+ #
+ # TODO: My kingdom for a pattern matcher
+ # https://www.python.org/dev/peps/pep-0634/
+ #
+ # TODO: This could get us in recomputation trouble if b.expr is nontrivial.
+ # Fix this by implementing some sort of sharing so that if multiple
+ # goals share the same expression, we only compute it once. This seems
+ # to matter in practice as compiler is often unwilling to CSE nontrivial
+ # expressions like scalar.to<scalar_t>()
+ t = b.type
+ if (
+ isinstance(t, ConstRefCType)
+ and isinstance(t.elem, OptionalCType)
+ and isinstance(t.elem.elem, BaseCType)
+ and str(t.elem.elem.type) == "at::Tensor"
+ ):
+ ctx[
+ NamedCType(t.elem.elem.name, ConstRefCType(BaseCType(tensorT)))
+ ] = f"({b.expr}.has_value() ? *{b.expr} : at::Tensor())"
+
+ if t.type == ConstRefCType(OptionalCType(BaseCType(tensorT))):
+ ctx[
+ NamedCType(t.name, BaseCType(optionalTensorRefT))
+ ] = f"(({b.expr}.has_value() && (*{b.expr}).defined()) ? at::OptionalTensorRef(*{b.expr}) : at::OptionalTensorRef())"
+
+ if t.type == ConstRefCType(BaseCType(scalarT)):
+ ctx[NamedCType(t.name, BaseCType(opmath_t))] = f"({b.expr}).to()"
+
+ if t.type == ConstRefCType(OptionalCType(BaseCType(scalarT))):
+ ctx[
+ NamedCType(t.name, BaseCType(optionalScalarRefT))
+ ] = f"({b.expr}.has_value() ? at::OptionalScalarRef(&({b.expr}.value())) : at::OptionalScalarRef())"
+
+ if t.type == BaseCType(scalar_t):
+ ctx[
+ NamedCType(t.name, BaseCType(opmath_t))
+ ] = f"static_cast({b.expr})"
+
+ # [Note: IOptTensorListRef]
+ if t.type == ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT)))):
+ ctx[
+ NamedCType(t.name, BaseCType(iOptTensorListRefT))
+ ] = f"at::IOptTensorListRef({b.expr})"
+
+ # Add implicit bindings if the generated code is inside a Tensor method
+ if method:
+ ctx[
+ NamedCType("self", MutRefCType(BaseCType(tensorT)))
+ ] = "const_cast(*this)"
+ ctx[
+ NamedCType("self", ConstRefCType(BaseCType(tensorT)))
+ ] = "const_cast(*this)"
+ # This is better! Byte-for-byte compat
+ # ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "*this"
+
+ def unsat(goal: NamedCType) -> NoReturn:
+ ctx_desc = "\n".join(
+ f" {t.cpp_type()} {t.name}; // {e}" for t, e in ctx.items()
+ )
+ raise UnsatError(
+ f"""
+Failed to synthesize the expression "{goal.cpp_type()} {goal.name}".
+When I failed, the following bindings were available in the context:
+
+{ctx_desc}
+
+This probably means there is a missing rule in the rules of torchgen.api.translate.
+Check this module for more information.
+"""
+ )
+
+ # A shitty backtracking search implementation. It's shitty because it
+ # does backtracking via stack (bad idea!) and for the most part tries to
+ # avoid backtracking. In particular, if
+ # direct=True, we won't try to do any fancy synthesis, just trivial
+ # conversions (e.g., "T a" is OK for "const T& a"). So all of the
+ # existing rules in this function simply try to solve immediately,
+ # and bail if things don't work out.
+ def solve(goal: NamedCType, *, direct: bool) -> str:
+ def direct_solve(goal: NamedCType) -> str:
+ return solve(goal, direct=True)
+
+ if goal in ctx:
+ # Trivial
+ return ctx[goal]
+
+ # const & is satisfied with mutable &
+ if isinstance(goal.type, ConstRefCType):
+ try:
+ # WARNING: not strictly decreasing; be careful not
+ # to add a direct conversion that satisfies
+ # mutable& with const&
+ return solve(
+ NamedCType(goal.name, MutRefCType(goal.type.elem)), direct=direct
+ )
+ except UnsatError:
+ pass
+
+ # mutable & is satisfied with value
+ if isinstance(goal.type, MutRefCType):
+ try:
+ return solve(NamedCType(goal.name, goal.type.elem), direct=direct)
+ except UnsatError:
+ pass
+
+ # TODO: These are referentially equal, shouldn't have to do this;
+ # ensuring we don't use type synonym IntArrayRef in codegen would
+ # help
+ if goal.type == ArrayRefCType(BaseCType(longT)):
+ return solve(NamedCType(goal.name, BaseCType(intArrayRefT)), direct=direct)
+
+ if direct:
+ unsat(goal)
+
+ # For now, all of these rules are mutually exclusive.
+ if goal == NamedCType("memory_format", OptionalCType(BaseCType(memoryFormatT))):
+ memory_format = direct_solve(
+ NamedCType(
+ SpecialArgName.possibly_redundant_memory_format,
+ OptionalCType(BaseCType(memoryFormatT)),
+ )
+ )
+ # No need to join "memory_format" and "options" if the target API takes "options" directly.
+ # Otherwise it will cause the redundant memory_format error.
+ if options_ctype in goal_ctypes:
+ return memory_format
+ try:
+ options = direct_solve(options_ctype)
+ return f"c10::impl::check_tensor_options_and_extract_memory_format({options}, {memory_format})"
+ except UnsatError:
+ return memory_format
+ elif goal == NamedCType("options", BaseCType(tensorOptionsT)):
+ dtype = direct_solve(
+ NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT)))
+ )
+ pin_memory = direct_solve(
+ NamedCType("pin_memory", OptionalCType(BaseCType(boolT)))
+ )
+ device = direct_solve(
+ NamedCType("device", OptionalCType(BaseCType(deviceT)))
+ )
+ layout = direct_solve(
+ NamedCType("layout", OptionalCType(BaseCType(layoutT)))
+ )
+ return f"TensorOptions().dtype({dtype}).layout({layout}).device({device}).pinned_memory({pin_memory})"
+
+ elif goal == NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))):
+ try:
+ options = direct_solve(options_ctype)
+ return f"c10::optTypeMetaToScalarType({options}.dtype_opt())"
+ except UnsatError:
+ out_tensor = direct_solve(out_tensor_ctype)
+ return f"{out_tensor}.scalar_type()"
+
+ elif goal == NamedCType("layout", OptionalCType(BaseCType(layoutT))):
+ try:
+ options = direct_solve(options_ctype)
+ return f"{options}.layout_opt()"
+ except UnsatError:
+ out_tensor = direct_solve(out_tensor_ctype)
+ return f"{out_tensor}.layout()"
+
+ elif goal == NamedCType("device", OptionalCType(BaseCType(deviceT))):
+ try:
+ options = direct_solve(options_ctype)
+ return f"{options}.device_opt()"
+ except UnsatError:
+ out_tensor = direct_solve(out_tensor_ctype)
+ return f"{out_tensor}.device()"
+
+ elif goal == NamedCType("pin_memory", OptionalCType(BaseCType(boolT))):
+ try:
+ options = direct_solve(options_ctype)
+ return f"{options}.pinned_memory_opt()"
+ except UnsatError:
+ # If we're calling a factory op from its out= variant,
+ # We don't actually care about the value of pin_memory.
+ out_tensor = direct_solve(out_tensor_ctype)
+ return "c10::nullopt"
+
+ # We can always do translations from value types to reference types, like vector -> IntArrayRef
+ elif goal.type == BaseCType(intArrayRefT):
+ try:
+ return direct_solve(NamedCType(goal.name, longVec_ctype))
+ except UnsatError:
+ # We can also go SymIntArrayRef -> IntArrayRef
+ symIntArrayRef_type = direct_solve(
+ NamedCType(goal.name, BaseCType(symIntArrayRefT))
+ )
+ return f"C10_AS_INTARRAYREF_SLOW({symIntArrayRef_type})"
+ elif goal.type == BaseCType(symIntArrayRefT):
+ try:
+ r = direct_solve(NamedCType(goal.name, BaseCType(intArrayRefT)))
+ return f"c10::fromIntArrayRefSlow({r})"
+ except UnsatError:
+ return direct_solve(NamedCType(goal.name, longSymVec_ctype))
+ elif goal.type == BaseCType(SymIntT):
+ return direct_solve(NamedCType(goal.name, BaseCType(longT)))
+ elif goal.type == OptionalCType(BaseCType(SymIntT)):
+ argname = direct_solve(
+ NamedCType(goal.name, OptionalCType(BaseCType(longT)))
+ )
+ return f"{argname}.has_value() ? c10::make_optional(c10::SymInt(*{argname})) : c10::nullopt"
+ elif goal.type == BaseCType(longT):
+ symInt_type = direct_solve(NamedCType(goal.name, BaseCType(SymIntT)))
+ return f"{symInt_type}.guard_int(__FILE__, __LINE__)"
+ elif goal.type == OptionalCType(BaseCType(longT)):
+ argname = direct_solve(
+ NamedCType(goal.name, OptionalCType(BaseCType(SymIntT)))
+ )
+ return f"{argname}.has_value() ? c10::make_optional({argname}->guard_int(__FILE__, __LINE__)) : c10::nullopt"
+ elif goal.type == BaseCType(optionalIntArrayRefT):
+ try:
+ return direct_solve(NamedCType(goal.name, optionalLongVec_ctype))
+ except UnsatError:
+ argname = direct_solve(
+ NamedCType(goal.name, BaseCType(optionalSymIntArrayRefT))
+ )
+ return f"{argname}.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*{argname})) : c10::nullopt"
+ elif goal.type == BaseCType(optionalSymIntArrayRefT):
+ # TODO: You might also want to solve this from longSymVec_ctype or
+ # an optional version of it
+ argname = direct_solve(
+ NamedCType(goal.name, BaseCType(optionalIntArrayRefT))
+ )
+ return f"{argname}.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*{argname})) : c10::nullopt"
+ elif goal.type == BaseCType(optionalScalarRefT):
+ return direct_solve(NamedCType(goal.name, optionalScalar_ctype))
+ elif goal.type == BaseCType(optionalTensorRefT):
+ return direct_solve(NamedCType(goal.name, optionalTensor_ctype))
+
+ # Note [translation from C++ reference to value types]
+ # The below cases are all for when we have an argument with a reference type,
+ # and a corresponding goal with a value type.
+ # These are needed when we populate the inputs to a lambda capture and we need
+ # to guarantee the lifetime of each captured argument.
+ # We guard it with an explicit kwarg because converting to a value type is expensive
+ # (e.g., O(n) to convert an IntArrayRef to a vector),
+ # so the caller of translate() should be explicit that they need it.
+ if allow_expensive_conversions:
+ if goal.type == VectorCType(BaseCType(longT)):
+ intArrayRef_ctype = NamedCType(goal.name, BaseCType(intArrayRefT))
+ argname = direct_solve(intArrayRef_ctype)
+ return f"{argname}.vec()"
+ if goal.type == VectorCType(BaseCType(SymIntT)):
+ symIntArrayRef_ctype = NamedCType(goal.name, BaseCType(symIntArrayRefT))
+ argname = direct_solve(symIntArrayRef_ctype)
+ return f"{argname}.vec()"
+ elif goal.type == OptionalCType(VectorCType(BaseCType(longT))):
+ optionalIntArrayRef_ctype = NamedCType(
+ goal.name, BaseCType(optionalIntArrayRefT)
+ )
+ argname = direct_solve(optionalIntArrayRef_ctype)
+ return f"{argname}.has_value() ? c10::make_optional({argname}->vec()) : c10::nullopt"
+ elif goal.type == OptionalCType(BaseCType(scalarT)):
+ optionalScalarRef_ctype = NamedCType(
+ goal.name, BaseCType(optionalScalarRefT)
+ )
+ argname = direct_solve(optionalScalarRef_ctype)
+ return f"{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt"
+ elif goal.type == OptionalCType(BaseCType(tensorT)):
+ optionalTensorRef_ctype = NamedCType(
+ goal.name, BaseCType(optionalTensorRefT)
+ )
+ argname = direct_solve(optionalTensorRef_ctype)
+ return f"{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt"
+ # Technically, we also need to handle cases of C++ containers holding reference types.
+ # But there currently aren't any ops that require lambda capture codegen
+ # with arguments like std::vector.
+ # If that changes, we'll have to add the translation here.
+
+ # We allow const casting on tensors, since const-correctness is a bit broken for at::Tensor.
+ # We could probably generalize this to non-tensor types too.
+ if goal.type == MutRefCType(BaseCType(tensorT)):
+ const_ref_tensor_ctype = NamedCType(
+ goal.name, ConstRefCType(BaseCType(tensorT))
+ )
+ argname = direct_solve(const_ref_tensor_ctype)
+ return f"const_cast({argname})"
+
+ unsat(goal)
+
+ return [Expr(solve(g, direct=False), g) for g in goal_ctypes]
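The `solve` routine above is goal-directed: each goal `NamedCType` is either satisfied directly from a binding already in context, or synthesized from other solvable goals (e.g., rebuilding `TensorOptions` from scattered `dtype`/`layout`/`device`/`pin_memory` arguments, or vice versa). The snippet below is a minimal, self-contained sketch of that pattern for illustration only; the names (`solve`, `ctx`, `UnsatError`) are stand-ins, not torchgen APIs.

```python
# Minimal sketch of the goal-directed solving pattern used by translate():
# a goal is either found directly in the context or synthesized from other
# goals that can themselves be solved.

class UnsatError(RuntimeError):
    pass

def solve(goal: str, ctx: dict) -> str:
    """Return a C++ expression that produces `goal` from the bindings in `ctx`."""
    def direct_solve(g: str) -> str:
        if g in ctx:
            return ctx[g]            # an expression already bound in the context
        raise UnsatError(g)

    try:
        return direct_solve(goal)
    except UnsatError:
        pass

    # Fall back to synthesis rules, mirroring e.g. the "options" case above.
    if goal == "options":
        dtype = direct_solve("dtype")
        layout = direct_solve("layout")
        device = direct_solve("device")
        pin_memory = direct_solve("pin_memory")
        return (f"TensorOptions().dtype({dtype}).layout({layout})"
                f".device({device}).pinned_memory({pin_memory})")
    if goal == "dtype":
        # e.g., recover the dtype back out of a packed options argument
        options = direct_solve("options")
        return f"c10::optTypeMetaToScalarType({options}.dtype_opt())"
    raise UnsatError(goal)

# Example: the caller has scattered scalar arguments and wants "options".
ctx = {"dtype": "dtype", "layout": "layout", "device": "device", "pin_memory": "pin_memory"}
print(solve("options", ctx))
```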
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/__init__.py b/MLPY/Lib/site-packages/torchgen/api/types/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..19ba90c31f9e12d0ceee8850ed2003f3c87b4e1b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/types/__init__.py
@@ -0,0 +1,3 @@
+from .types import *
+from .types_base import *
+from .signatures import * # isort:skip
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dc9af1fdc571c17112d7854c2f22b7bafa2b061
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/signatures.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/signatures.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..551b7594110e252b4792d255222d4a0c63b2ca87
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/signatures.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/types.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/types.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3b56056eb98c774131b25dbb18c658e6380ad2c
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/types.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/types_base.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/types_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2623b96b2371f683296cb5f5106fbe932cb01eb6
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/api/types/__pycache__/types_base.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/signatures.py b/MLPY/Lib/site-packages/torchgen/api/types/signatures.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5716fea645da0799d3e994899be69a6086b28ab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/types/signatures.py
@@ -0,0 +1,423 @@
+from dataclasses import dataclass
+
+from typing import Iterator, List, Optional, Sequence, Set, Tuple, Union
+
+from torchgen.model import (
+ BackendIndex,
+ FunctionSchema,
+ NativeFunction,
+ NativeFunctionsGroup,
+ NativeFunctionsViewGroup,
+)
+
+from .types_base import Binding, CType, Expr
+
+
+@dataclass(frozen=True)
+class CppSignature:
+ """
+ A CppSignature represents a single overload in the C++ API. For
+ any given function schema, there may be multiple CppSignatures
+ corresponding to it, based on how we desugar to C++. See also
+ CppSignatureGroup.
+ """
+
+ # The schema this signature is derived from
+ func: FunctionSchema
+
+ # Is this a C++ signature for a method, i.e. Tensor::my_op(...)?
+ method: bool
+
+ # Is this a faithful C++ signature (i.e. following the JIT schema) or a convenience API
+ # (i.e. with a potential TensorOptions argument and out arguments in the front)
+ faithful: bool
+
+ # Is this a symint C++ signature? For BC reasons, functions that take
+ # SymInts are still presented as int64_t in C++, and the SymInt variant is
+ # offered at a different overload name
+ #
+ # NB: If a function RETURNS a SymInt, this is ALWAYS false
+ symint: bool
+
+ # The set of C++ arguments which should not have defaults applied to them
+ cpp_no_default_args: Set[str]
+
+ # Is this a fallback C++ binding? Fallback bindings are enabled by
+ # manual_cpp_binding: True and are alternate, non-public API that
+ # lets manual C++ binding implementors access the binding that would
+ # have been automatically generated
+ fallback_binding: bool = False
+
+ # Return the unpacked argument structure of this signature,
+ # discarding information about which arguments are semantically
+ # related to each other.
+ def arguments(self) -> Sequence[Binding]:
+ return cpp.arguments(
+ self.func.arguments,
+ faithful=self.faithful,
+ symint=self.symint,
+ method=self.method,
+ cpp_no_default_args=self.cpp_no_default_args,
+ )
+
+ def name(self, *, suppress_symint_suffix: bool = False) -> str:
+ n = cpp.name(
+ self.func,
+ faithful_name_for_out_overloads=self.faithful,
+ symint_overload=False if suppress_symint_suffix else self.symint,
+ )
+ if self.fallback_binding:
+ n = f"__dispatch_{n}"
+ return n
+
+ # Render the C++ declaration for this signature
+ def decl(
+ self,
+ *,
+ name: Optional[str] = None,
+ prefix: str = "",
+ is_redispatching_fn: bool = False,
+ suppress_symint_suffix: bool = False,
+ ) -> str:
+ returns_type = cpp.returns_type(
+ self.func.returns, symint=self.symint
+ ).cpp_type()
+ cpp_args = [a.decl() for a in self.arguments()]
+ if is_redispatching_fn:
+ cpp_args = ["c10::DispatchKeySet dispatchKeySet"] + cpp_args
+ cpp_args_str = ", ".join(cpp_args)
+ if name is None:
+ name = prefix + self.name(suppress_symint_suffix=suppress_symint_suffix)
+ return f"{returns_type} {name}({cpp_args_str})"
+
+ # Render the C++ definition for this signature, not including
+ # the body (with curly braces)
+ def defn(
+ self,
+ *,
+ name: Optional[str] = None,
+ prefix: str = "",
+ is_redispatching_fn: bool = False,
+ ) -> str:
+ returns_type = cpp.returns_type(
+ self.func.returns, symint=self.symint
+ ).cpp_type()
+ cpp_args = [a.defn() for a in self.arguments()]
+ if is_redispatching_fn:
+ cpp_args = ["c10::DispatchKeySet dispatchKeySet"] + cpp_args
+ cpp_args_str = ", ".join(cpp_args)
+ if name is None:
+ name = prefix + self.name()
+ return f"{returns_type} {name}({cpp_args_str})"
+
+ def ptr_type(self) -> str:
+ args_types_str = ", ".join(a.type for a in self.arguments())
+ return f"{cpp.returns_type(self.func.returns, symint=self.symint).cpp_type()} (*)({args_types_str})"
+
+ # Return the C++ function type, e.g., something like int(bool)
+ def type(self) -> str:
+ args_types_str = ", ".join(a.type for a in self.arguments())
+ return f"{cpp.returns_type(self.func.returns, symint=self.symint).cpp_type()} ({args_types_str})"
+
+
+# Represents group of all CppSignatures associated with a
+# FunctionSchema. Right now, that's the regular, user-visible
+# signature, as well as a "faithful" signature which doesn't
+# have grouping.
+@dataclass(frozen=True)
+class CppSignatureGroup:
+ func: FunctionSchema
+ signature: CppSignature
+ faithful_signature: Optional[CppSignature]
+ symint_signature: Optional[CppSignature]
+ symint_faithful_signature: Optional[CppSignature]
+
+ def most_faithful_signature(self) -> CppSignature:
+ if self.faithful_signature:
+ return self.faithful_signature
+ else:
+ return self.signature
+
+ def signatures(self, *, symint: bool = True) -> Iterator[CppSignature]:
+ yield self.signature
+ if self.faithful_signature:
+ yield self.faithful_signature
+ if symint:
+ if self.symint_signature:
+ yield self.symint_signature
+ if self.symint_faithful_signature:
+ yield self.symint_faithful_signature
+
+ @staticmethod
+ def from_native_function(
+ f: NativeFunction, *, method: bool, fallback_binding: bool = False
+ ) -> "CppSignatureGroup":
+ func = f.func
+
+ def make_sig(*, faithful: bool, symint: bool) -> CppSignature:
+ return CppSignature(
+ func=func,
+ faithful=faithful,
+ symint=symint,
+ method=method,
+ fallback_binding=fallback_binding,
+ cpp_no_default_args=f.cpp_no_default_args,
+ )
+
+ def make_sigs(*, symint: bool) -> Tuple[CppSignature, Optional[CppSignature]]:
+ faithful_signature: Optional[CppSignature] = None
+ if func.arguments.tensor_options is not None or len(func.arguments.out) > 0:
+ faithful_signature = make_sig(faithful=True, symint=symint)
+ signature = make_sig(faithful=False, symint=symint)
+ return signature, faithful_signature
+
+ signature, faithful_signature = make_sigs(symint=False)
+ symint_signature: Optional[CppSignature] = None
+ symint_faithful_signature: Optional[CppSignature] = None
+ if func.has_symint():
+ symint_signature, symint_faithful_signature = make_sigs(symint=True)
+
+ return CppSignatureGroup(
+ func=func,
+ signature=signature,
+ faithful_signature=faithful_signature,
+ symint_signature=symint_signature,
+ symint_faithful_signature=symint_faithful_signature,
+ )
+
+
+@dataclass(frozen=True)
+class DispatcherSignature:
+ # The schema this signature is derived from
+ func: FunctionSchema
+
+ # Allows you to prepend an arbitrary prefix to the signature name.
+ # This is useful for parts of the codegen that generate wrappers around kernels,
+ # and need to avoid naming collisions.
+ prefix: str = ""
+
+ symint: bool = True
+
+ def arguments(self) -> List[Binding]:
+ return dispatcher.arguments(self.func, symint=self.symint)
+
+ def name(self) -> str:
+ return self.prefix + dispatcher.name(self.func)
+
+ def decl(self, name: Optional[str] = None) -> str:
+ args_str = ", ".join(a.decl() for a in self.arguments())
+ if name is None:
+ name = self.name()
+ return f"{self.returns_type().cpp_type()} {name}({args_str})"
+
+ def defn(
+ self, name: Optional[str] = None, *, is_redispatching_fn: bool = False
+ ) -> str:
+ args = [a.defn() for a in self.arguments()]
+ if is_redispatching_fn:
+ args = ["c10::DispatchKeySet dispatchKeySet"] + args
+ args_str = ", ".join(args)
+ if name is None:
+ name = self.name()
+ return f"{self.returns_type().cpp_type()} {name}({args_str})"
+
+ def exprs(self) -> List[Expr]:
+ return [Expr(a.name, a.nctype) for a in self.arguments()]
+
+ def returns_type(self) -> CType:
+ return dispatcher.returns_type(self.func.returns, symint=self.symint)
+
+ def ptr_type(self) -> str:
+ dispatcher_args_types_str = ", ".join(a.type for a in self.arguments())
+ return f"{self.returns_type().cpp_type()} (*)({dispatcher_args_types_str})"
+
+ # Return the C++ function type, e.g., something like int(bool)
+ def type(self) -> str:
+ dispatcher_args_types_str = ", ".join(a.type for a in self.arguments())
+ return f"{self.returns_type().cpp_type()} ({dispatcher_args_types_str})"
+
+ @staticmethod
+ def from_schema(
+ func: FunctionSchema, *, prefix: str = "", symint: bool = True
+ ) -> "DispatcherSignature":
+ return DispatcherSignature(func, prefix, symint)
+
+
+@dataclass(frozen=True)
+class NativeSignature:
+ # The schema this signature is derived from
+ func: FunctionSchema
+
+ symint: bool
+
+ prefix: str = ""
+
+ def name(self) -> str:
+ return self.prefix + native.name(self.func)
+
+ def decl(self, name: Optional[str] = None) -> str:
+ args_str = ", ".join(a.decl() for a in self.arguments())
+ if name is None:
+ name = self.name()
+ return f"{native.returns_type(self.func.returns, symint=self.symint).cpp_type()} {name}({args_str})"
+
+ def defn(self, name: Optional[str] = None) -> str:
+ args_str = ", ".join(a.defn() for a in self.arguments())
+ if name is None:
+ name = self.name()
+ return f"{native.returns_type(self.func.returns, symint=self.symint).cpp_type()} {name}({args_str})"
+
+ def ptr_type(self) -> str:
+ # don't include defaults in type signature!
+ args_str = ", ".join(a.defn() for a in self.arguments())
+ return f"{native.returns_type(self.func.returns, symint=self.symint).cpp_type()} (*)({args_str})"
+
+ def arguments(self) -> List[Binding]:
+ return native.arguments(self.func, symint=self.symint)
+
+ def returns_type(self) -> CType:
+ return native.returns_type(self.func.returns, symint=self.symint)
+
+ def dispatcher_exprs(self) -> List[Expr]:
+ return translate.translate(
+ self.arguments(), dispatcher.arguments(self.func), method=False
+ )
+
+
+@dataclass(frozen=True)
+class ViewInverseSignature:
+ g: NativeFunctionsViewGroup
+
+ def name(self) -> str:
+ return functionalization.reverse_name(self.g.view, include_namespace=False)
+
+ def decl(self) -> str:
+ return_type = functionalization.returns_type(self.g.view.func)
+ decls = [
+ a.decl()
+ for a in functionalization.inner_arguments(
+ self.g.view.func, is_reverse=True
+ )
+ ]
+ return f"static {return_type.cpp_type()} {self.name()}({', '.join(decls)});"
+
+
+@dataclass(frozen=True)
+class FunctionalizationLambda:
+ g: NativeFunctionsViewGroup
+
+ # are we generating the forward lambda or the reverse lambda?
+ is_reverse: bool
+
+ def captures(self) -> List[Expr]:
+ # The lambda lives inside of a kernel following the dispatcher API, so its outer context is the dispatcher arguments
+ # We also need to read the "reapply views" TLS at the time that the functionalization kernel was executed,
+ # and plumb it into the lambda.
+ outer_ctx = dispatcher.arguments(self.g.view.func) + [
+ functionalization.reapply_views_binding,
+ functionalization.inverse_return_mode_binding,
+ ]
+ capture_bindings = functionalization.capture_arguments(
+ self.g.view.func, is_reverse=self.is_reverse
+ )
+ # allow_expensive_conversions is set because we want to convert
+ # some reference types (IntArrayRef) to value types (vector).
+ capture_exprs = translate.translate(
+ outer_ctx, capture_bindings, method=False, allow_expensive_conversions=True
+ )
+ return capture_exprs
+
+ def decl(self) -> str:
+ return_type = functionalization.returns_type(self.g.view.func)
+ capture_str = ", ".join(
+ f"{val.type.name} = {val.expr}" for val in self.captures()
+ )
+ decls = [
+ a.decl()
+ for a in functionalization.outer_arguments(is_reverse=self.is_reverse)
+ ]
+ return f"[{capture_str}]({', '.join(decls)}) -> {return_type.cpp_type()}"
+
+ def inner_call(self, *, reapply_views: Optional[bool] = None) -> str:
+ inner_call_name = functionalization.name(
+ self.g,
+ is_reverse=self.is_reverse,
+ include_namespace=True,
+ reapply_views=reapply_views,
+ )
+
+ arg_ctx = functionalization.outer_arguments(is_reverse=self.is_reverse)
+ capture_ctx = functionalization.capture_arguments(
+ self.g.view.func, is_reverse=self.is_reverse
+ )
+ full_ctx = arg_ctx + capture_ctx
+
+ assert self.g.view_copy is not None
+ call_bindings = functionalization.inner_arguments(
+ self.g.view_copy.func, is_reverse=self.is_reverse
+ )
+ maybe_index = functionalization.inner_call_index(self.g.view_copy.func)
+ call_exprs = [
+ e.expr for e in translate.translate(full_ctx, call_bindings, method=False)
+ ]
+ if not self.is_reverse and maybe_index is not None:
+ return f'{inner_call_name}({", ".join(call_exprs)})[{maybe_index.name}];'
+ else:
+ return f'{inner_call_name}({", ".join(call_exprs)});'
+
+ @staticmethod
+ def from_func(
+ g: NativeFunctionsViewGroup, *, is_reverse: bool
+ ) -> "FunctionalizationLambda":
+ return FunctionalizationLambda(g, is_reverse)
+
+
+@dataclass(frozen=True)
+class StructuredImplSignature:
+ g: NativeFunctionsGroup
+ name: str
+
+ def defn(self, name: Optional[str] = None) -> str:
+ args_str = ", ".join(a.defn() for a in self.arguments())
+ return f"TORCH_IMPL_FUNC({self.name})({args_str})"
+
+ def arguments(self) -> List[Binding]:
+ return structured.impl_arguments(self.g)
+
+
+# Helper functions
+
+
+def kernel_signature(
+ f: NativeFunction, backend_index: BackendIndex, *, prefix: str = ""
+) -> Union["NativeSignature", "DispatcherSignature"]:
+ # Note [External Backends Follow Dispatcher API]
+ # Kernel signatures for in-tree backends follow the "native" API,
+ # while kernels for out-of-tree backends follow the dispatcher API.
+ # See the comments in `native.py` for details, but historically there have been
+ # some small differences in schema convention between them and the Dispatcher API.
+ # Any differences that require translating between the two will result in a runtime cost,
+ # so we'd like to keep the differences as small as possible.
+ # With external backends, we'd like to enforce that they write their kernels with schemas
+ # that match the Dispatcher API directly, if they can.
+ meta = backend_index.get_kernel(f)
+ symint = meta is not None and meta.supports_symint()
+ if symint:
+ assert (
+ f.func.has_symint()
+ ), f"attempted to define symint kernel for {backend_index.dispatch_key} without SymInt in schema"
+ if backend_index.external:
+ return DispatcherSignature.from_schema(f.func, prefix=prefix, symint=symint)
+ else:
+ return NativeSignature(f.func, prefix=prefix, symint=symint)
+
+
+# Functions only, no types
+from torchgen.api import (
+ cpp,
+ dispatcher,
+ functionalization,
+ native,
+ structured,
+ translate,
+)
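As a rough usage sketch of the signature classes defined above: given a parsed `NativeFunction` `f` (obtained elsewhere in torchgen from native_functions.yaml), the group of C++ signatures and the dispatcher signature can be rendered like this. Only calls that appear in signatures.py itself are used; the wrapper name is illustrative.

```python
# Sketch: rendering C++ declarations for a NativeFunction `f`.
from torchgen.api.types import CppSignatureGroup, DispatcherSignature

def render_decls(f) -> None:  # f: NativeFunction
    sig_group = CppSignatureGroup.from_native_function(f, method=False)
    for sig in sig_group.signatures():
        # e.g. "at::Tensor add(const at::Tensor & self, const at::Tensor & other, ...)"
        print(sig.decl())
    disp = DispatcherSignature.from_schema(f.func)
    # Definition line for a hypothetical wrapper around the dispatcher call.
    print(disp.defn(name=f"wrapper_{disp.name()}"))
```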
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/types.py b/MLPY/Lib/site-packages/torchgen/api/types/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d686ce5418dc8cd8c637add5586555a985eddf5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/types/types.py
@@ -0,0 +1,190 @@
+"""
+Where should I add a new type? `types_base.py` vs `types.py`
+
+This file defines data model classes for the torchgen typing system, as well as some base types such as int32_t.
+
+`types.py` defines ATen Tensor type and some c10 types, along with signatures that use these types.
+
+The difference between these two files is that `types_base.py` should be implementation-agnostic, meaning it shouldn't
+contain any type definition that is tied to a specific C++ library (e.g., ATen), so that it can be easily reused
+if we want to generate code for another C++ library.
+
+Add new types to `types.py` if these types are ATen/c10 related.
+Add new types to `types_base.py` if they are basic and not attached to ATen/c10.
+"""
+from dataclasses import dataclass
+from typing import Dict
+
+from torchgen.model import BaseTy, ScalarType
+
+from .types_base import (
+ BaseCppType,
+ BaseCType,
+ boolT,
+ byteT,
+ charT,
+ CType,
+ doubleT,
+ floatT,
+ int32T,
+ longT,
+ shortT,
+)
+
+
+TENSOR_LIST_LIKE_CTYPES = [
+ "at::TensorList",
+ "const c10::List> &",
+ "const at::ITensorListRef &",
+]
+
+
+halfT = BaseCppType("at", "Half")
+complexHalfT = BaseCppType(
+ "c10", "complex<c10::Half>"
+) # stuffing template param here is an abuse
+complexFloatT = BaseCppType("c10", "complex<float>")
+complexDoubleT = BaseCppType("c10", "complex<double>")
+bfloat16T = BaseCppType("at", "BFloat16")
+float8_e5m2T = BaseCppType("at", "Float8_e5m2")
+float8_e5m2fnuzT = BaseCppType("at", "Float8_e5m2fnuz")
+float8_e4m3fnT = BaseCppType("at", "Float8_e4m3fn")
+float8_e4m3fnuzT = BaseCppType("at", "Float8_e4m3fnuz")
+stringT = BaseCppType("c10", "string_view")
+generatorT = BaseCppType("at", "Generator")
+scalarTypeT = BaseCppType("at", "ScalarType")
+tensorT = BaseCppType("at", "Tensor")
+optionalTensorRefT = BaseCppType("at", "OptionalTensorRef")
+tensorListT = BaseCppType("at", "TensorList")
+iTensorListRefT = BaseCppType("at", "ITensorListRef")
+iOptTensorListRefT = BaseCppType("at", "IOptTensorListRef")
+dimnameT = BaseCppType("at", "Dimname")
+dimnameListT = BaseCppType("at", "DimnameList")
+dimVectorT = BaseCppType("at", "DimVector")
+layoutT = BaseCppType("at", "Layout")
+deviceT = BaseCppType("at", "Device")
+deviceIndexT = BaseCppType("at", "DeviceIndex")
+scalarT = BaseCppType("at", "Scalar")
+optionalScalarRefT = BaseCppType("at", "OptionalScalarRef")
+memoryFormatT = BaseCppType("at", "MemoryFormat")
+qschemeT = BaseCppType("at", "QScheme")
+storageT = BaseCppType("at", "Storage")
+streamT = BaseCppType("at", "Stream")
+intArrayRefT = BaseCppType("at", "IntArrayRef")
+optionalIntArrayRefT = BaseCppType("at", "OptionalIntArrayRef")
+optionalSymIntArrayRefT = BaseCppType("at", "OptionalSymIntArrayRef")
+tensorOptionsT = BaseCppType("at", "TensorOptions")
+typeAndSizeT = BaseCppType("torch::autograd::generated", "TypeAndSize")
+tensorGeometryT = BaseCppType("at", "TensorGeometry")
+SymIntT = BaseCppType("c10", "SymInt")
+symIntArrayRefT = BaseCppType("c10", "SymIntArrayRef")
+
+# Types representing template parameters. Technically, we probably shouldn't
+# represent them this way in codegen, but it was pretty convenient.
+scalar_t = BaseCppType("", "scalar_t")
+opmath_t = BaseCppType("", "opmath_t")
+
+ScalarTypeToCppMapping: Dict[ScalarType, BaseCppType] = {
+ ScalarType.Byte: byteT,
+ ScalarType.Char: charT,
+ ScalarType.Short: shortT,
+ ScalarType.Int: int32T,
+ ScalarType.Long: longT,
+ ScalarType.Half: halfT,
+ ScalarType.Float: floatT,
+ ScalarType.Double: doubleT,
+ ScalarType.ComplexHalf: complexHalfT,
+ ScalarType.ComplexFloat: complexFloatT,
+ ScalarType.ComplexDouble: complexDoubleT,
+ ScalarType.Bool: boolT,
+ ScalarType.Float8_e5m2: float8_e5m2T,
+ ScalarType.Float8_e5m2fnuz: float8_e5m2fnuzT,
+ ScalarType.Float8_e4m3fn: float8_e4m3fnT,
+ ScalarType.Float8_e4m3fnuz: float8_e4m3fnuzT,
+}
+
+BaseTypeToCppMapping: Dict[BaseTy, BaseCppType] = {
+ BaseTy.int: longT,
+ BaseTy.float: doubleT,
+ BaseTy.bool: boolT,
+ BaseTy.str: stringT,
+ BaseTy.Generator: generatorT,
+ BaseTy.ScalarType: scalarTypeT,
+ BaseTy.Tensor: tensorT,
+ BaseTy.Dimname: dimnameT,
+ BaseTy.DimVector: dimVectorT,
+ BaseTy.Layout: layoutT,
+ BaseTy.Device: deviceT,
+ BaseTy.DeviceIndex: deviceIndexT,
+ BaseTy.Scalar: scalarT,
+ BaseTy.MemoryFormat: memoryFormatT,
+ BaseTy.QScheme: qschemeT,
+ BaseTy.Storage: storageT,
+ BaseTy.Stream: streamT,
+ BaseTy.SymInt: SymIntT,
+}
+
+# CTypes encode C++ type structure as needed for translation.
+
+
+@dataclass(frozen=True)
+class OptionalCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"c10::optional<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"c10::optional<{self.elem.cpp_type_registration_declarations()}>"
+
+ def remove_const_ref(self) -> "CType":
+ return OptionalCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class ListCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"c10::List<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"c10::List<{self.elem.cpp_type_registration_declarations()}>"
+
+ def remove_const_ref(self) -> "CType":
+ return ListCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class ArrayRefCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"at::ArrayRef<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"ArrayRef<{self.elem.cpp_type_registration_declarations()}>"
+
+ def remove_const_ref(self) -> "CType":
+ return ArrayRefCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class VectorizedCType(CType):
+ # This template is explicitly specialized, so the only valid
+ # elems are those we have specializations for (e.g., float, double, ...)
+ # scalar_t is also a common argument here (when we are codegen in
+ # a templated context)
+ elem: BaseCType
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ return f"at::vec::Vectorized<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ raise NotImplementedError
+
+ def remove_const_ref(self) -> "CType":
+ return self
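The `CType` combinators above compose structurally. A quick illustration of how they render C++ type strings, using only classes and singletons defined in `types.py`/`types_base.py` (expected output shown in the trailing comments):

```python
# Sketch: composing CTypes into C++ type strings.
from torchgen.api.types import (
    BaseCType, ConstRefCType, OptionalCType, VectorCType, ListCType,
    tensorT, longT, scalarT,
)

print(ConstRefCType(BaseCType(tensorT)).cpp_type())             # const at::Tensor &
print(OptionalCType(BaseCType(longT)).cpp_type())               # c10::optional<int64_t>
print(VectorCType(BaseCType(longT)).cpp_type())                 # ::std::vector<int64_t>
print(ListCType(OptionalCType(BaseCType(tensorT))).cpp_type())  # c10::List<c10::optional<at::Tensor>>
# remove_const_ref() peels the reference wrapper back off:
print(ConstRefCType(BaseCType(scalarT)).remove_const_ref().cpp_type())  # at::Scalar
```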
diff --git a/MLPY/Lib/site-packages/torchgen/api/types/types_base.py b/MLPY/Lib/site-packages/torchgen/api/types/types_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a53015f3a7f2778aad39394892f790b2cc7e2620
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/types/types_base.py
@@ -0,0 +1,270 @@
+"""
+Where should I add a new type? `types_base.py` vs `types.py`
+
+This file defines data model classes for the torchgen typing system, as well as some base types such as int32_t.
+
+`types.py` defines ATen Tensor type and some c10 types, along with signatures that use these types.
+
+The difference between these two files is that `types_base.py` should be implementation-agnostic, meaning it shouldn't
+contain any type definition that is tied to a specific C++ library (e.g., ATen), so that it can be easily reused
+if we want to generate code for another C++ library.
+
+Add new types to `types.py` if these types are ATen/c10 related.
+Add new types to `types_base.py` if they are basic and not attached to ATen/c10.
+"""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import auto, Enum
+from typing import List, Optional, Union
+
+from torchgen.model import Argument, SelfArgument, TensorOptionsArguments
+
+# An ArgName is just the str name of the argument in schema;
+# but in some special circumstances, we may add a little extra
+# context. The Enum SpecialArgName covers all of these cases;
+# grep for their construction sites to see when they can occur.
+
+
+class SpecialArgName(Enum):
+ possibly_redundant_memory_format = auto()
+
+
+ArgName = Union[str, SpecialArgName]
+
+
+# This class shouldn't be created directly; instead, use/create one of the singletons below.
+@dataclass(frozen=True)
+class BaseCppType:
+ ns: Optional[str]
+ name: str
+
+ def __str__(self) -> str:
+ if self.ns is None or self.ns == "":
+ return self.name
+ return f"{self.ns}::{self.name}"
+
+
+# The set of all non-templated, valid, fully-qualified names of C++ types that are used in the codegen.
+# Templated types get their own dataclass, mainly to make namespace parsing easier.
+byteT = BaseCppType("", "uint8_t")
+charT = BaseCppType("", "int8_t")
+shortT = BaseCppType("", "int16_t")
+# It would be more symmetric for this to be called intT, but it is easy to mix
+# this up with JIT int (which is int64_t in C++), so we intentionally don't
+# define intT to make it obvious when you've stuffed it up
+int32T = BaseCppType("", "int32_t")
+longT = BaseCppType("", "int64_t")
+doubleT = BaseCppType("", "double")
+floatT = BaseCppType("", "float")
+boolT = BaseCppType("", "bool")
+voidT = BaseCppType("", "void")
+
+
+class CType(ABC):
+ @abstractmethod
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ raise NotImplementedError
+
+ @abstractmethod
+ def cpp_type_registration_declarations(self) -> str:
+ raise NotImplementedError
+
+ @abstractmethod
+ def remove_const_ref(self) -> "CType":
+ return self
+
+
+@dataclass(frozen=True)
+class BaseCType(CType):
+ type: BaseCppType
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ return str(self.type)
+
+ # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml
+ # TODO: Kill this when we eventually remove it!
+ def cpp_type_registration_declarations(self) -> str:
+ return str(self.type).replace("at::", "")
+
+ def remove_const_ref(self) -> "CType":
+ return self
+
+
+@dataclass(frozen=True)
+class ConstRefCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ if strip_ref:
+ return self.elem.cpp_type(strip_ref=strip_ref)
+ return f"const {self.elem.cpp_type()} &"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"const {self.elem.cpp_type_registration_declarations()} &"
+
+ def remove_const_ref(self) -> "CType":
+ return self.elem.remove_const_ref()
+
+
+@dataclass(frozen=True)
+class VectorCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"::std::vector<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"::std::vector<{self.elem.cpp_type_registration_declarations()}>"
+
+ def remove_const_ref(self) -> "CType":
+ return VectorCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class ArrayCType(CType):
+ elem: "CType"
+ size: int
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"::std::array<{self.elem.cpp_type()},{self.size}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"::std::array<{self.elem.cpp_type_registration_declarations()},{self.size}>"
+
+ def remove_const_ref(self) -> "CType":
+ return ArrayCType(self.elem.remove_const_ref(), self.size)
+
+
+@dataclass(frozen=True)
+class TupleCType(CType):
+ elems: List["CType"]
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f'::std::tuple<{",".join([e.cpp_type() for e in self.elems])}>'
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f'::std::tuple<{",".join([e.cpp_type_registration_declarations() for e in self.elems])}>'
+
+ def remove_const_ref(self) -> "CType":
+ return TupleCType([e.remove_const_ref() for e in self.elems])
+
+
+@dataclass(frozen=True)
+class MutRefCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ if strip_ref:
+ return self.elem.cpp_type(strip_ref=strip_ref)
+ return f"{self.elem.cpp_type()} &"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"{self.elem.cpp_type_registration_declarations()} &"
+
+ def remove_const_ref(self) -> "CType":
+ return self.elem.remove_const_ref()
+
+
+# A NamedCType is short for Named C++ semantic type. A NamedCType represents a C++ type, plus
+# semantic information about what it represents. For example, consider the
+# argument "bool pin_memory"; its normal C++ type is "bool", but its C++
+# semantic type also keeps track that this represents a "pin_memory"; you can't
+# just use a random other boolean in a context where you need a "pin_memory"!
+#
+
+
+@dataclass(frozen=True)
+class NamedCType:
+ name: ArgName
+ type: CType
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ return self.type.cpp_type(strip_ref=strip_ref)
+
+ # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml
+ # TODO: Kill this when we eventually remove it!
+ def cpp_type_registration_declarations(self) -> str:
+ return self.type.cpp_type_registration_declarations()
+
+ def remove_const_ref(self) -> "NamedCType":
+ return NamedCType(self.name, self.type.remove_const_ref())
+
+ def with_name(self, name: str) -> "NamedCType":
+ return NamedCType(name, self.type)
+
+
+# A binding represents any C++ binding site for a formal parameter.
+# We don't distinguish between binding sites for different APIs;
+# instead, all of the important distinctions are encoded in CType,
+# which you can use to figure out if a given Binding is appropriate
+# for use in another context. (See torchgen.api.translate)
+
+
+@dataclass(frozen=True)
+class Binding:
+ name: str
+ nctype: NamedCType
+ argument: Union[Argument, TensorOptionsArguments, SelfArgument]
+ # TODO: maybe don't represent default here
+ default: Optional[str] = None
+
+ def rename(self, name: str) -> "Binding":
+ return Binding(
+ name=name,
+ nctype=self.nctype,
+ argument=self.argument,
+ default=self.default,
+ )
+
+ @property
+ def type(self) -> str:
+ return self.nctype.cpp_type()
+
+ def no_default(self) -> "Binding":
+ return Binding(
+ name=self.name,
+ nctype=self.nctype,
+ default=None,
+ argument=self.argument,
+ )
+
+ def decl(self, *, func_ptr_cast: bool = False) -> str:
+ mb_default = ""
+ if self.default is not None:
+ mb_default = f"={self.default}"
+
+ # casting only needs to know the type
+ if func_ptr_cast:
+ return f"{self.type}"
+ else:
+ return f"{self.type} {self.name}{mb_default}"
+
+ # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml
+ # TODO: Kill this when we eventually remove it!
+ def decl_registration_declarations(self) -> str:
+ type_s = self.nctype.cpp_type_registration_declarations()
+ mb_default = ""
+ if self.default is not None:
+ mb_default = f"={self.default}"
+ return f"{type_s} {self.name}{mb_default}"
+
+ def defn(self) -> str:
+ return f"{self.type} {self.name}"
+
+ def with_name(self, name: str) -> "Binding":
+ return Binding(
+ name=name, nctype=self.nctype, argument=self.argument, default=self.default
+ )
+
+
+# An Expr is a C++ expression. It has a C++ string representing its syntax,
+# as well as a CType saying what it provides.
+
+
+@dataclass(frozen=True)
+class Expr:
+ expr: str
+ type: NamedCType
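A small sketch of how `NamedCType`, `Binding`, and `Expr` fit together: a `Binding` is a named formal parameter whose `decl()`/`defn()` render C++ parameter syntax, while an `Expr` pairs a C++ expression with the `NamedCType` it provides (this is what `translate()` consumes and produces). The schema string passed to `Argument.parse` is only an illustrative example.

```python
# Sketch: building a Binding by hand and rendering it.
from torchgen.api.types import BaseCType, Binding, ConstRefCType, Expr, NamedCType, tensorT
from torchgen.model import Argument

arg = Argument.parse("Tensor self")   # parse a schema-style argument
b = Binding(
    name="self",
    nctype=NamedCType("self", ConstRefCType(BaseCType(tensorT))),
    argument=arg,
)
print(b.decl())                        # const at::Tensor & self
print(b.defn())                        # const at::Tensor & self
print(Expr("self_", b.nctype).expr)    # self_  (an expression providing that NamedCType)
```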
diff --git a/MLPY/Lib/site-packages/torchgen/api/ufunc.py b/MLPY/Lib/site-packages/torchgen/api/ufunc.py
new file mode 100644
index 0000000000000000000000000000000000000000..01d8e9598ab5f74eafc40e86b8e0f917dd0b2e8c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/ufunc.py
@@ -0,0 +1,209 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torchgen.api.types as api_types
+
+from torchgen.api import cpp, structured
+from torchgen.api.types import (
+ ArgName,
+ BaseCppType,
+ BaseCType,
+ Binding,
+ ConstRefCType,
+ CType,
+ NamedCType,
+ scalarT,
+)
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ DispatchKey,
+ FunctionSchema,
+ NativeFunctionsGroup,
+ Type,
+)
+
+
+def schema_kernel_name(func: FunctionSchema, dispatch_key: DispatchKey) -> str:
+ assert func.is_out_fn(), "ufunc.kernel_name should only be invoked on out schemas"
+ return f"ufunc_{func.name.name}_{dispatch_key}"
+
+
+def kernel_name(g: NativeFunctionsGroup, dispatch_key: DispatchKey) -> str:
+ return schema_kernel_name(g.out.func, dispatch_key)
+
+
+# Tensors are omitted (as they are stored in TensorIterator), everything else is
+# passed along (technically, we can pass tensors along too, it just wastes
+# argument registers)
+#
+# NB: used for CPU only
+def dispatchstub_type(t: Type, *, binds: ArgName) -> Optional[NamedCType]:
+ # Dispatch stubs are always plain ints
+ r = cpp.valuetype_type(t, binds=binds, symint=False)
+ if r is not None:
+ return r
+
+ if t == BaseType(BaseTy.Scalar):
+ return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+ elif t == BaseType(BaseTy.Tensor):
+ return None
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+def opmath_type(scalar_t: BaseCppType) -> BaseCppType:
+ if scalar_t == api_types.scalar_t:
+ return api_types.opmath_t
+ raise NotImplementedError
+
+
+# NB: Tensors in constructor are stored in opmath_t, not scalar_t
+# because Tensor in constructor = it's a scalar tensor partially applied =
+# it can be higher precision and we want to compute in that higher precision
+#
+# NB: CUDA only
+def ufunctor_ctor_type(t: Type, *, binds: ArgName, scalar_t: BaseCppType) -> NamedCType:
+ r = cpp.valuetype_type(t, binds=binds, symint=False)
+ if r is not None:
+ return r
+
+ if t == BaseType(BaseTy.Scalar):
+ return NamedCType(binds, BaseCType(opmath_type(scalar_t)))
+ elif t == BaseType(BaseTy.Tensor):
+ return NamedCType(binds, BaseCType(opmath_type(scalar_t)))
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+# Only Tensors ever get passed directly to operator()
+#
+# NB: CUDA only
+# (Actually, this works for CPU too)
+def ufunctor_apply_type(
+ t: Type, *, binds: ArgName, scalar_t: BaseCppType
+) -> NamedCType:
+ if t == BaseType(BaseTy.Tensor):
+ return NamedCType(binds, BaseCType(scalar_t))
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+# The actual ufunc template function the user writes. Everything here
+# is done in the computation type. compute_t is opmath_t in CUDA and scalar_t
+# in CPU
+def ufunc_type(t: Type, *, binds: ArgName, compute_t: CType) -> NamedCType:
+ r = cpp.valuetype_type(t, binds=binds, symint=False)
+ if r is not None:
+ return r
+
+ if t == BaseType(BaseTy.Scalar):
+ return NamedCType(binds, compute_t)
+ elif t == BaseType(BaseTy.Tensor):
+ return NamedCType(binds, compute_t)
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+def ufunctor_ctor_argument(a: Argument, scalar_t: BaseCppType) -> Binding:
+ return Binding(
+ nctype=ufunctor_ctor_type(a.type, binds=a.name, scalar_t=scalar_t),
+ name=a.name,
+ default=None,
+ argument=a,
+ )
+
+
+def ufunctor_apply_argument(a: Argument, scalar_t: BaseCppType) -> Binding:
+ return Binding(
+ nctype=ufunctor_apply_type(a.type, binds=a.name, scalar_t=scalar_t),
+ name=a.name,
+ default=None,
+ argument=a,
+ )
+
+
+def ufunc_argument(a: Argument, compute_t: CType) -> Binding:
+ return Binding(
+ nctype=ufunc_type(a.type, binds=a.name, compute_t=compute_t),
+ name=a.name,
+ default=None,
+ argument=a,
+ )
+
+
+@dataclass(frozen=True)
+class UfunctorBindings:
+ ctor: List[Binding]
+ apply: List[Binding]
+
+
+# ufunctors are a CUDA-only concept representing functors that take some of
+# their arguments on a host-side constructor, and the rest in the device-side
+# apply. E.g.,
+#
+# template <typename scalar_t>
+# struct CUDAFunctorOnSelf_add {
+# using opmath_t = at::opmath_type<scalar_t>;
+# opmath_t other_;
+# opmath_t alpha_;
+# CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha) : other_(other), alpha_(alpha) {}
+# __device__ scalar_t operator()(scalar_t self) {
+# return ufunc::add(static_cast<opmath_t>(self), other_, alpha_);
+# }
+# };
+#
+# The ctor refers to the constructor CUDAFunctorOnSelf_add, while apply refers
+# to the operator() definition
+def ufunctor_arguments(
+ g: NativeFunctionsGroup, *, scalar_tensor_idx: Optional[int], scalar_t: BaseCppType
+) -> UfunctorBindings:
+ ctor = []
+ apply = []
+ for a in g.functional.func.arguments.flat_non_out:
+ if a.type.is_tensor_like():
+ if scalar_tensor_idx == 0:
+ # put it in the ctor anyway
+ ctor.append(ufunctor_ctor_argument(a, scalar_t=scalar_t))
+ scalar_tensor_idx = None
+ else:
+ if scalar_tensor_idx is not None:
+ scalar_tensor_idx -= 1
+ apply.append(ufunctor_apply_argument(a, scalar_t=scalar_t))
+ else:
+ ctor.append(ufunctor_ctor_argument(a, scalar_t=scalar_t))
+ assert scalar_tensor_idx is None
+ return UfunctorBindings(ctor=ctor, apply=apply)
+
+
+# ufuncs are the inner loop template functions that you wrote in ufunc/add.h
+# which do the actual computation in question. E.g.,
+#
+# template <typename T>
+# C10_HOST_DEVICE T add(T self, T other, T alpha) __ubsan_ignore_undefined__ {
+# return self + alpha * other;
+# }
+#
+# In this file, we refer to T as compute_t which is bound by caller
+def ufunc_arguments(g: NativeFunctionsGroup, *, compute_t: CType) -> List[Binding]:
+ return [
+ ufunc_argument(a, compute_t=compute_t)
+ for a in g.functional.func.arguments.flat_non_out
+ ]
+
+
+# Stubs are the DispatchStub trampolines that CPU kernels use to get to their
+# vectorized versions. E.g.,
+#
+# using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha);
+# DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub);
+def stub_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+ # stubs drop all tensor arguments (they are implicit in the TensorIterator
+ # argument) and keep everything else
+ return [
+ r
+ for a in g.out.func.arguments.flat_non_out
+ if not a.type.is_tensor_like()
+ for r in structured.argument(a)
+ ]
diff --git a/MLPY/Lib/site-packages/torchgen/api/unboxing.py b/MLPY/Lib/site-packages/torchgen/api/unboxing.py
new file mode 100644
index 0000000000000000000000000000000000000000..60d671d024733ed05b69c0f30f043daadd904b11
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/api/unboxing.py
@@ -0,0 +1,248 @@
+from typing import List, Tuple
+
+from torchgen.api import cpp
+from torchgen.api.types import Binding, CppSignatureGroup, CType
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ ListType,
+ NativeFunction,
+ OptionalType,
+ Type,
+)
+
+# This file generates the code for unboxing wrappers, i.e., the glue logic to unbox a boxed operator and convert the
+# ivalues from stack to correct arguments to the unboxed kernel, based on corresponding JIT schema. This codegen is
+# an alternative way to generate unboxing wrappers similar to the existing C++ metaprogramming approach but gets the
+# job done statically. These generated unboxing wrappers will be useful under the scenario where we need to register
+# a fixed set of operators known at compile time and thus can save some time in runtime initialization phase.
+#
+# Here's an example on how the codegen works:
+#
+# - Function Schema (source of truth)
+#
+# aten::empty.names(int[] size, *, Dimname[]? names,
+# ScalarType? dtype=None, Layout? layout=None,
+# Device? device=None, bool? pin_memory=None,
+# MemoryFormat? memory_format=None) -> Tensor
+# - Argument Conversion
+# Generates C++ code to convert an ivalue (from stack) to its underlying C++ type.
+# - int[] size
+# ```cpp
+# const c10::List<c10::IValue> size_list_in = (std::move(peek(stack, 0, 7))).toList();
+#
+# std::vector<int64_t> size_vec;
+# for (c10::IValue size_elem: size_list_in) {
+# int64_t size_base = size_elem.to<int64_t>();
+# size_vec.push_back(size_base);
+# }
+# at::ArrayRef<int64_t> size_list_out(size_vec);
+# ~~~~~~~~~~~~~ <-- The converted argument from ivalues in the stack.
+# Will be passed to unboxed kernel.
+# ```
+# - Dimname[]? names
+# ```cpp
+# c10::optional<c10::IValue> names_opt = (std::move(peek(stack, 1, 7))).toOptional<c10::IValue>();
+# c10::optional<at::ArrayRef<at::Dimname>> names_opt_out;
+# if (names_opt.has_value()) {
+# ~~~~~~~~~~~ <-- Unwrapping optional shell
+# const c10::IValue names_opt_in = names_opt.value();
+# const c10::List<c10::IValue> names_list_in = names_opt_in.toList();
+#
+# std::vector<at::Dimname> names_vec;
+# for (c10::IValue names_elem: names_list_in) {
+# ~~~~~~~~~~~~~~~~~~~~~~~~~ <-- Unrolling list, then convert elements one by one.
+# at::Dimname names_base = names_elem.to<at::Dimname>();
+# names_vec.push_back(names_base);
+# }
+# at::ArrayRef<at::Dimname> names_list_out(names_vec);
+#
+# names_opt_out = c10::optional<at::ArrayRef<at::Dimname>>(names_list_out);
+# } else {
+# names_opt_out = c10::optional<at::ArrayRef<at::Dimname>>();
+# }
+# ```
+# - ScalarType? dtype (similarly for the rest of the arguments)
+# ```cpp
+# c10::optional<c10::IValue> dtype_opt = (std::move(peek(stack, 2, 7))).toOptional<c10::IValue>();
+# c10::optional<at::ScalarType> dtype_opt_out;
+# if (dtype_opt.has_value()) {
+# const c10::IValue dtype_opt_in = dtype_opt.value();
+# at::ScalarType dtype_base = dtype_opt_in.to<at::ScalarType>();
+# ~~~~~~~~~~~~~~~~~~~~ <-- For base types, convert ivalue to it
+# directly using ".to<T>()" API.
+# dtype_opt_out = c10::optional<at::ScalarType>(dtype_base);
+# } else {
+# dtype_opt_out = c10::optional<at::ScalarType>();
+# }
+# ```
+#
+# - Unboxed Kernel Call
+# ```cpp
+# auto result_ = torch::empty(
+# size_list_out,
+# names_opt_out,
+# options,
+# memory_format_opt_out
+# );
+# ```
+#
+# - Push Result Back to Stack
+# ```cpp
+# drop(stack, 7);
+# pack(stack, std::move(result_));
+# ```
+connector = "\n\t"
+
+
+# Return unboxing function name for a NativeFunction
+def name(f: NativeFunction) -> str:
+ return f.func.name.unambiguous_name()
+
+
+# Convert all the arguments in a NativeFunction to C++ code
+def convert_arguments(f: NativeFunction) -> Tuple[List[Binding], List[str]]:
+ # we need the 'self' argument so method needs to be False
+ args = (
+ CppSignatureGroup.from_native_function(f, method=False)
+ .most_faithful_signature()
+ .arguments()
+ )
+ code_list = [
+ f"c10::IValue {args[i].name} = std::move(peek(stack, {i}, {len(args)}));"
+ for i in range(len(args))
+ ] + [""]
+ binding_list = []
+ for arg in args:
+ # expecting only Argument
+ if not isinstance(arg.argument, Argument):
+ raise Exception(
+ f"Unexpected argument type, expecting `Argument` but got {arg}"
+ )
+ argument: Argument = arg.argument
+ unboxed_name, _, code, decl = argumenttype_ivalue_convert(
+ argument.type,
+ argument.name,
+ mutable=argument.is_write,
+ )
+ code_list.extend(decl)
+ code_list.extend(code)
+ binding_list.append(arg.with_name(unboxed_name))
+ return binding_list, code_list
+
+
+# Takes in the type, name and mutability corresponding to an argument, and generates a tuple of:
+# (1) the C++ code necessary to unbox the argument
+# (2) A Binding corresponding to the newly created unboxed variable, including variable name and its CType
+def argumenttype_ivalue_convert(
+ t: Type, arg_name: str, *, mutable: bool = False
+) -> Tuple[str, CType, List[str], List[str]]:
+ # Unboxing is for mobile, which doesn't care about SymInts
+ ctype = cpp.argumenttype_type(
+ t=t, mutable=mutable, binds=arg_name, symint=False
+ ).type
+
+ if isinstance(t, BaseType):
+ out_name = f"{arg_name}_base"
+ code, decl = _gen_code_base_type(
+ arg_name=arg_name, out_name=out_name, ctype=ctype
+ )
+ elif isinstance(t, OptionalType):
+ out_name = f"{arg_name}_opt_out"
+ code, decl = _gen_code_optional_type(
+ arg_name=arg_name,
+ out_name=out_name,
+ t=t,
+ ctype=ctype,
+ )
+ elif isinstance(t, ListType):
+ out_name = f"{arg_name}_list_out"
+ code, decl = _gen_code_list_type(
+ arg_name=arg_name,
+ out_name=out_name,
+ t=t,
+ ctype=ctype,
+ )
+ else:
+ raise Exception(f"Cannot handle type {t}. arg_name: {arg_name}")
+ return out_name, ctype, code, decl
+
+
+def _gen_code_base_type(
+ arg_name: str, out_name: str, ctype: CType
+) -> Tuple[List[str], List[str]]:
+ return [
+ f"{ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.to<{ctype.cpp_type(strip_ref=True)}>();"
+ ], []
+
+
+def _gen_code_optional_type(
+ arg_name: str, out_name: str, t: OptionalType, ctype: CType
+) -> Tuple[List[str], List[str]]:
+ in_name = f"{arg_name}_opt_in"
+ res_name, _, res_code, decl = argumenttype_ivalue_convert(t.elem, in_name)
+ return (
+ f"""
+c10::optional<c10::IValue> {arg_name}_opt = {arg_name}.toOptional<c10::IValue>();
+{ctype.cpp_type(strip_ref=True)} {out_name};
+if ({arg_name}_opt.has_value()) {{
+ const c10::IValue {in_name} = {arg_name}_opt.value();
+ {connector.join(res_code)}
+ {out_name} = {ctype.cpp_type(strip_ref=True)}({res_name});
+}} else {{
+ {out_name} = {ctype.cpp_type(strip_ref=True)}();
+}}
+ """.split(
+ "\n"
+ ),
+ decl,
+ )
+
+
+def _gen_code_list_type(
+ arg_name: str, out_name: str, t: ListType, ctype: CType
+) -> Tuple[List[str], List[str]]:
+ in_name = f"{arg_name}_list_in"
+ elem_name = f"{arg_name}_elem"
+ code = [f"const c10::List {in_name} = {arg_name}.toList();"]
+ res_name, res_ctype, res_code, decl = argumenttype_ivalue_convert(t.elem, elem_name)
+ # handle list type with size, e.g., bool[4]
+ if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.bool and t.size:
+ code.extend(
+ f"""
+{ctype.cpp_type(strip_ref=True)} {out_name} = as_array<{res_ctype.cpp_type(strip_ref=True)}, {t.size}>({in_name});
+ """.split(
+ "\n"
+ )
+ )
+ # we have to use c10::List for optional elements, e.g., Tensor?[] -> c10::List<c10::optional<at::Tensor>>
+ elif isinstance(t.elem, OptionalType):
+ code.extend(
+ f"""
+{ctype.cpp_type(strip_ref=True)} {out_name};
+for (c10::IValue {elem_name}: {in_name}) {{
+ {connector.join(res_code)}
+ {out_name}.push_back({res_name});
+}}
+ """.split(
+ "\n"
+ )
+ )
+ else:
+ # use ArrayRef as default.
+ vec_name = arg_name + "_vec"
+ # need to bring vector instantiation out of scope so that ArrayRef has valid data
+ decl.append(f"std::vector<{res_ctype.cpp_type(strip_ref=True)}> {vec_name};")
+ code.extend(
+ f"""
+for (c10::IValue {elem_name}: {in_name}) {{
+ {connector.join(res_code)}
+ {vec_name}.push_back({res_name});
+}}
+{ctype.cpp_type(strip_ref=True)} {out_name}({vec_name});
+ """.split(
+ "\n"
+ )
+ )
+ return code, decl
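A hedged sketch of how the helpers above might be driven for a single `NativeFunction`. The `at::` call at the end is a simplification for illustration; the real generated wrappers also drop the consumed stack entries and pack the result back, as described in the module comment.

```python
# Sketch only: stitch together the unboxing snippets for one NativeFunction `f`.
from torchgen.api.unboxing import convert_arguments, name as unboxing_name

def unboxing_snippet(f) -> str:  # f: NativeFunction
    bindings, code = convert_arguments(f)        # per-argument ivalue -> C++ conversion code
    body = "\n".join(code)
    arg_list = ", ".join(b.name for b in bindings)
    # Simplified kernel call; real codegen also handles drop()/pack() on the stack.
    return f"// {unboxing_name(f)}\n{body}\nauto result_ = at::{f.func.name.name}({arg_list});"
```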
diff --git a/MLPY/Lib/site-packages/torchgen/code_template.py b/MLPY/Lib/site-packages/torchgen/code_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..01784303507057a13c95d2853ca84744cd9e237a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/code_template.py
@@ -0,0 +1,96 @@
+import re
+from typing import Mapping, Match, Optional, Sequence
+
+# Match $identifier or ${identifier} and replace it with the value in env.
+# If the identifier is at the beginning of a line (preceded only by whitespace)
+# and its value is a list, it is treated as a block substitution: each element
+# of the list is put on its own line, indented to that depth.
+# If the identifier is on a line starting with non-whitespace and its value is
+# a list, the elements are comma separated: ${,foo} inserts a comma before the
+# list if the list is not empty, and ${foo,} inserts one after.
+
+
+class CodeTemplate:
+ substitution_str = r"(^[^\n\S]*)?\$([^\d\W]\w*|\{,?[^\d\W]\w*\,?})"
+ substitution = re.compile(substitution_str, re.MULTILINE)
+
+ pattern: str
+ filename: str
+
+ @staticmethod
+ def from_file(filename: str) -> "CodeTemplate":
+ with open(filename) as f:
+ return CodeTemplate(f.read(), filename)
+
+ def __init__(self, pattern: str, filename: str = "") -> None:
+ self.pattern = pattern
+ self.filename = filename
+
+ def substitute(
+ self, env: Optional[Mapping[str, object]] = None, **kwargs: object
+ ) -> str:
+ if env is None:
+ env = {}
+
+ def lookup(v: str) -> object:
+ assert env is not None
+ return kwargs[v] if v in kwargs else env[v]
+
+ def indent_lines(indent: str, v: Sequence[object]) -> str:
+ return "".join(
+ [indent + l + "\n" for e in v for l in str(e).splitlines()]
+ ).rstrip()
+
+ def replace(match: Match[str]) -> str:
+ indent = match.group(1)
+ key = match.group(2)
+ comma_before = ""
+ comma_after = ""
+ if key[0] == "{":
+ key = key[1:-1]
+ if key[0] == ",":
+ comma_before = ", "
+ key = key[1:]
+ if key[-1] == ",":
+ comma_after = ", "
+ key = key[:-1]
+ v = lookup(key)
+ if indent is not None:
+ if not isinstance(v, list):
+ v = [v]
+ return indent_lines(indent, v)
+ elif isinstance(v, list):
+ middle = ", ".join([str(x) for x in v])
+ if len(v) == 0:
+ return middle
+ return comma_before + middle + comma_after
+ else:
+ return str(v)
+
+ return self.substitution.sub(replace, self.pattern)
+
+
+if __name__ == "__main__":
+ c = CodeTemplate(
+ """\
+ int foo($args) {
+
+ $bar
+ $bar
+ $a+$b
+ }
+ int commatest(int a${,stuff})
+ int notest(int a${,empty,})
+ """
+ )
+ print(
+ c.substitute(
+ args=["hi", 8],
+ bar=["what", 7],
+ a=3,
+ b=4,
+ stuff=["things...", "others"],
+ empty=[],
+ )
+ )
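The block-substitution behavior described at the top of the file can also be seen with a smaller template; this is just an extra illustration alongside the `__main__` example above, with the expected output in the trailing comment.

```python
# Sketch: block vs. inline substitution with CodeTemplate.
from torchgen.code_template import CodeTemplate

t = CodeTemplate("void $name() {\n    $body\n}\n")
print(t.substitute(name="f", body=["int x = 0;", "return;"]))
# void f() {
#     int x = 0;
#     return;
# }
```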
diff --git a/MLPY/Lib/site-packages/torchgen/context.py b/MLPY/Lib/site-packages/torchgen/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e3b4772b5a4b89996c5b66c18a6d543cd8955ef
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/context.py
@@ -0,0 +1,128 @@
+import contextlib
+
+import functools
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union
+
+import torchgen.local as local
+from torchgen.model import (
+ BackendIndex,
+ DispatchKey,
+ NativeFunction,
+ NativeFunctionsGroup,
+ NativeFunctionsViewGroup,
+)
+from torchgen.utils import context, S, T
+
+# Helper functions for defining generators on things in the model
+
+F = TypeVar(
+ "F",
+ NativeFunction,
+ NativeFunctionsGroup,
+ NativeFunctionsViewGroup,
+ Union[NativeFunction, NativeFunctionsGroup],
+ Union[NativeFunction, NativeFunctionsViewGroup],
+)
+
+F2 = TypeVar(
+ "F2",
+ NativeFunction,
+ NativeFunctionsGroup,
+ Optional[NativeFunction],
+ bool,
+ str,
+)
+
+F3 = TypeVar("F3", Tuple[NativeFunction, Any], List[NativeFunction])
+
+
+@contextlib.contextmanager
+def native_function_manager(
+ g: Union[NativeFunctionsGroup, NativeFunctionsViewGroup, NativeFunction]
+) -> Iterator[None]:
+ if isinstance(g, NativeFunctionsGroup):
+ # By default, we associate all errors with structured native functions
+ # with the out variant. In some cases, it might be better to have
+ # a more specific place to hang things; if so, use
+ # native_function_manager again on the inside
+ f = g.out
+ elif isinstance(g, NativeFunctionsViewGroup):
+ # We associate errors with the view operator
+ f = g.view
+ else:
+ f = g
+ with context(lambda: f"in native_functions.yaml line {f.loc}:\n {f.func}"):
+ with local.parametrize(
+ use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors,
+ use_ilistref_for_tensor_lists=f.part_of_structured_group,
+ ):
+ yield
+
+
+# Given a function that operates on NativeFunction, wrap it into a new function
+# that sets some appropriate context managers for that native function.
+# YOU MUST WRAP FUNCTIONS IN THIS for calls to api modules to be sound
+# (you will get an error if we try to access the local variables without having
+# set them).
+def with_native_function(func: Callable[[F], T]) -> Callable[[F], T]:
+ @functools.wraps(func)
+ def wrapper(f: F) -> T:
+ with native_function_manager(f):
+ return func(f)
+
+ return wrapper
+
+
+def with_native_function_and(func: Callable[[F, F2], T]) -> Callable[[F, F2], T]:
+ @functools.wraps(func)
+ def wrapper(f: F, f2: F2) -> T:
+ # The first native_function is assumed to be the one with the appropriate context.
+ with native_function_manager(f):
+ return func(f, f2)
+
+ return wrapper
+
+
+def method_with_native_function(func: Callable[[S, F], T]) -> Callable[[S, F], T]:
+ @functools.wraps(func)
+ def wrapper(slf: S, f: F) -> T:
+ with native_function_manager(f):
+ return func(slf, f)
+
+ return wrapper
+
+
+def method_with_nested_native_function(
+ func: Callable[[S, F3], T]
+) -> Callable[[S, F3], T]:
+ @functools.wraps(func)
+ def wrapper(slf: S, f: F3) -> T:
+ with native_function_manager(f[0]):
+ return func(slf, f)
+
+ return wrapper
+
+
+# Convenience decorator for functions that explicitly take in a BackendIndex,
+# instead of indirectly taking one in as a closure
+def with_native_function_and_index(
+ func: Callable[[F, BackendIndex], T]
+) -> Callable[[F, BackendIndex], T]:
+ @functools.wraps(func)
+ def wrapper(f: F, backend_index: BackendIndex) -> T:
+ with native_function_manager(f):
+ return func(f, backend_index)
+
+ return wrapper
+
+
+# Convenience decorator for functions that explicitly take in a Dict of BackendIndices
+def with_native_function_and_indices(
+ func: Callable[[F, Dict[DispatchKey, BackendIndex]], T]
+) -> Callable[[F, Dict[DispatchKey, BackendIndex]], T]:
+ @functools.wraps(func)
+ def wrapper(f: F, backend_indices: Dict[DispatchKey, BackendIndex]) -> T:
+ with native_function_manager(f):
+ return func(f, backend_indices)
+
+ return wrapper
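A sketch of how the decorators above are typically used: wrapping a per-function codegen routine so that any error is attributed to the right line of native_functions.yaml and the `torchgen.local` settings are in effect while torchgen.api modules run. The body is illustrative and relies only on APIs shown earlier in this diff.

```python
# Sketch: a per-NativeFunction codegen routine wrapped with with_native_function.
from torchgen.context import with_native_function
from torchgen.model import NativeFunction
from torchgen.api.types import CppSignatureGroup

@with_native_function
def compute_decl(f: NativeFunction) -> str:
    # Errors raised in here are reported with f's location in native_functions.yaml.
    sig = CppSignatureGroup.from_native_function(f, method=False).most_faithful_signature()
    return f"TORCH_API {sig.decl()};"
```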
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__init__.py b/MLPY/Lib/site-packages/torchgen/dest/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..958b5a29017d9886efad778a413884248347ea7c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/dest/__init__.py
@@ -0,0 +1,19 @@
+from .lazy_ir import (
+ generate_non_native_lazy_ir_nodes as generate_non_native_lazy_ir_nodes,
+ GenLazyIR as GenLazyIR,
+ GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition,
+ GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition,
+)
+from .native_functions import (
+ compute_native_function_declaration as compute_native_function_declaration,
+)
+from .register_dispatch_key import (
+ gen_registration_headers as gen_registration_headers,
+ gen_registration_helpers as gen_registration_helpers,
+ RegisterDispatchKey as RegisterDispatchKey,
+)
+from .ufunc import (
+ compute_ufunc_cpu as compute_ufunc_cpu,
+ compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel,
+ compute_ufunc_cuda as compute_ufunc_cuda,
+)
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b5e0ce048ef8216fadba086cf42fb5781720c15
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__pycache__/lazy_ir.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/lazy_ir.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17d07e8e3eb35d147d0b9908ace144a7f2b97297
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/lazy_ir.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__pycache__/lazy_ts_lowering.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/lazy_ts_lowering.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1afe09868023ab018f1cf3176db22837b5cebdab
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/lazy_ts_lowering.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__pycache__/native_functions.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/native_functions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eb3d1c9dcce7e476952d40f3b9cb8435054a229
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/native_functions.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__pycache__/register_dispatch_key.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/register_dispatch_key.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..735b617f81556f6e2760451c9f240fb177ec409d
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/register_dispatch_key.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/dest/__pycache__/ufunc.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/ufunc.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f392de98f02c4fe035a540cab53e602d394b780
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/dest/__pycache__/ufunc.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/dest/lazy_ir.py b/MLPY/Lib/site-packages/torchgen/dest/lazy_ir.py
new file mode 100644
index 0000000000000000000000000000000000000000..84a00001e5e5cd415bf835ef07e3621624c9d6ae
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/dest/lazy_ir.py
@@ -0,0 +1,707 @@
+import itertools
+from abc import ABC
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torchgen.api.dispatcher as dispatcher
+from torchgen.api.lazy import (
+ getValueT,
+ isValueType,
+ LazyArgument,
+ LazyIrProperties,
+ LazyIrSchema,
+ tensorListValueT,
+)
+from torchgen.api.translate import translate
+from torchgen.api.types import (
+ BaseCType,
+ Binding,
+ deviceT,
+ DispatcherSignature,
+ kernel_signature,
+ NativeSignature,
+ OptionalCType,
+ VectorCType,
+)
+from torchgen.context import method_with_native_function
+from torchgen.dest.lazy_ts_lowering import ts_lowering_body
+from torchgen.model import (
+ Argument,
+ BackendIndex,
+ BackendMetadata,
+ BaseTy,
+ BaseType,
+ FunctionSchema,
+ ListType,
+ NativeFunction,
+ NativeFunctionsGroup,
+)
+
+
+def node_ctor_arg_rvalue_string(arg: LazyArgument) -> str:
+ """
+ Given a LazyArgument,
+ generate a c++ string for materializing an rvalue of that arg for passing into
+ a lazy Node constructor.
+ """
+
+ # TODO: Matching on CType seems wrong; should be matching on Type
+ if isValueType(arg.lazy_type):
+ if isinstance(arg.lazy_type, BaseCType):
+ if arg.is_wrapped_scalar:
+ return f"node_{arg.name}"
+ elif arg.lazy_type.type is tensorListValueT:
+ return f"lazy_{arg.name}_tensorlist"
+ elif arg.is_symint_or_list:
+ return f"GetSymIntValue({arg.name})"
+ return f"lazy_{arg.name}->GetIrValue()"
+ elif isinstance(arg.lazy_type, OptionalCType):
+ if arg.is_symint_or_list:
+ # TODO: I don't understand when you should put lazy_ in the name
+ # or not
+ return f"{arg.name} ? c10::make_optional(GetSymIntValue(*{arg.name})) : c10::nullopt"
+ elif arg.is_wrapped_scalar:
+ return f"node_{arg.name}"
+ return (
+ f"lazy_{arg.name} ? "
+ f"c10::make_optional(lazy_{arg.name}->GetIrValue()) : "
+ "c10::nullopt"
+ )
+ else:
+ raise AssertionError(
+ f"TODO not sure if there are other valid types to handle here ({arg.lazy_type})"
+ )
+ else:
+ # NB: this is here because right now we aren't treating SymInt[] as a
+ # value type; when we do this needs to move above
+ # NB: we cannot test arg.lazy_type as we've already specified it is an
+ # int64_t and so we cannot distinguish between SymInt and int64_t
+ if isinstance(arg.orig_type, ListType) and arg.orig_type.elem == BaseType(
+ BaseTy.SymInt
+ ):
+ if arg.symint:
+ return f"GetSymIntArrayRefValue({arg.name})"
+ else:
+                return f"std::vector<int64_t>({arg.name}.begin(), {arg.name}.end())"
+ elif isinstance(arg.lazy_type, VectorCType) and isinstance(
+ arg.lazy_type.elem, BaseCType
+ ):
+ return f"std::vector<{arg.lazy_type.elem.type}>({arg.name}.begin(), {arg.name}.end())"
+ elif (
+ isinstance(arg.lazy_type, OptionalCType)
+ and isinstance(arg.lazy_type.elem, VectorCType)
+ and isinstance(arg.lazy_type.elem.elem, BaseCType)
+ ):
+ return f"torch::lazy::ToOptionalVector<{arg.lazy_type.elem.elem.type}>({arg.name})"
+ else:
+ return f"{arg.name}"
+
+
+def node_ctor_inputs(schema: LazyIrSchema) -> str:
+ """
+ Produce a formatted string with the arguments as passed into the constructor of a node class.
+ """
+ node_ctor_values = [
+ node_ctor_arg_rvalue_string(arg) for arg in schema.filtered_args()
+ ]
+ return ", ".join(node_ctor_values)
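+# For a schema such as "add(Tensor self, Tensor other, Scalar alpha)" this yields
+# roughly "lazy_self->GetIrValue(), lazy_other->GetIrValue(), alpha" (illustrative
+# only; the exact rvalues come from node_ctor_arg_rvalue_string above).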
+
+
+def gen_fallback_code(
+ schema: LazyIrSchema,
+ sig: Union[DispatcherSignature, NativeSignature],
+ overload_name: str,
+) -> str:
+ """
+ Generate code that falls back to eager conditioned on a predicate
+ """
+ dispatcher_sig = DispatcherSignature.from_schema(schema.func)
+ exprs = translate(sig.arguments(), dispatcher_sig.arguments())
+ fallback_args = ",\n ".join([a.expr for a in exprs])
+ if len(overload_name):
+ aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})"
+ else:
+ aten_op_str = f"ATEN_OP({schema.aten_name})"
+ return f"""
+ if (force_eager_fallback({aten_symbol(schema)})) {{
+            return at::native::call_fallback_fn_symint<&ltc_eager_fallback, {aten_op_str}>::call(
+ {fallback_args}
+ );
+ }}
+"""
+
+
+def aten_symbol(schema: LazyIrSchema) -> str:
+ missing_interned_strings = {
+ "sigmoid_backward",
+ }
+ if schema.aten_name in missing_interned_strings:
+ return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")'
+
+ if not schema.aten_name.startswith("at::"):
+ return f"at::aten::{schema.aten_name}"
+ else:
+ return schema.aten_name
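+# For instance, an op named "add" maps to "at::aten::add", while names listed in
+# missing_interned_strings fall back to a runtime c10::Symbol::fromQualString lookup.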
+
+
+# converts all tensor-like arguments to meta tensors. Returns:
+# (1) a string containing all of the logic that does the conversions.
+# (2) a context, to be used by translate(), with all of the relevant bindings.
+def convert_to_meta_tensors(sig: DispatcherSignature) -> Tuple[str, List[Binding]]:
+ context: List[Binding] = []
+ unwrapped_tensor_args: List[str] = []
+ for arg in sig.arguments():
+ if isinstance(arg.argument, Argument) and arg.argument.type.is_tensor_like():
+ unwrapped_name = f"{arg.name}_meta"
+ unwrapped_tensor_args.append(
+ f"auto {unwrapped_name} = to_meta({arg.name});"
+ )
+ context.append(arg.with_name(unwrapped_name))
+ else:
+ context.append(arg)
+ unwrap_tensor_args_str = "\n ".join(unwrapped_tensor_args)
+ return unwrap_tensor_args_str, context
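+# Sketch of the generated conversion for a tensor argument named "self" (assuming a
+# to_meta() helper is in scope at the call site): "auto self_meta = to_meta(self);",
+# with the returned context rebinding "self" to "self_meta" for translate().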
+
+
+@dataclass(frozen=True)
+class GenLazyIR(ABC):
+ backend_index: BackendIndex
+ backend_name: str
+ node_base: str
+ use_lazy_shape: bool
+
+ @method_with_native_function
+ def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]:
+ func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func
+ metadata = self.backend_index.get_kernel(
+ f.functional if isinstance(f, NativeFunctionsGroup) else f
+ )
+ schema = LazyIrSchema(
+ func, symint=metadata is not None and metadata.supports_symint()
+ )
+ return self.gen(schema)
+
+ # there is no lowering functionality generated unless this IR base class is subclassed and
+ # implemented as a backend-specific node
+ def lowering_function(self, schema: LazyIrSchema) -> str:
+ return ""
+
+ def create_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
+ return ""
+
+ def can_be_reused_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
+ return f"""bool CanBeReused({node_ctor_args}) const {{
+ return false;
+ }}"""
+
+ def node_base_ctor_call(self, schema: LazyIrSchema) -> str:
+ value_args = schema.filtered_args(values=True, scalars=False)
+ # backends can customize the way the node base class constructor is called,
+ # as long as all of its arguments can be generated from information available from the schema
+ base_ctor_value_args_list = []
+ for arg in value_args:
+ if isinstance(arg.lazy_type, (BaseCType, VectorCType)):
+ base_ctor_value_args_list.append(f"{arg.name}")
+ elif isinstance(arg.lazy_type, OptionalCType):
+ base_ctor_value_args_list.append(f"{arg.name}.value_or(kNullValue)")
+ else:
+ raise AssertionError(
+ f"Unsupported type ({arg.lazy_type}) - add support if necessary"
+ )
+ base_ctor_value_args = ", ".join(base_ctor_value_args_list)
+
+ scalar_args = schema.filtered_args(values=False, scalars=True)
+
+ # Shape construction.
+ # Conditionally build shape depending on specified shape property
+ if schema.properties.ShapePrecompute:
+ shape_ctor_arg = "std::move(shapes),"
+ elif schema.properties.ShapeCompute:
+ shape_args = [a.name for a in value_args]
+ shape_args.extend(a.name for a in scalar_args)
+ shape_ctor_arg = f"compute_shape_{schema.name}({', '.join(shape_args)}),"
+ elif schema.properties.ShapeCache:
+ shape_args = [f"operand({i})" for i in range(len(value_args))]
+ shape_args.extend(a.name for a in scalar_args)
+ shape_ctor_arg = f"[&](){{ return compute_shape_{schema.name}({', '.join(shape_args)})[0]; }},"
+ else:
+ shape_ctor_arg = ""
+
+ scalar_hashes = ", ".join(f"{a.name}" for a in scalar_args)
+
+ return f"""{self.node_base}(
+ {schema.node_name}::ClassOpKind(),
+ OpList{{{base_ctor_value_args}}},
+ {shape_ctor_arg}
+ /* num_outputs */ {len(schema.returns)},
+ torch::lazy::MHash({scalar_hashes}))"""
+
+ def gen(self, schema: LazyIrSchema) -> List[str]:
+ opkind = schema.opkind or aten_symbol(schema)
+
+ # for now, we just want one IR class decl and soon after also the method defs
+ # and we use the functional version not out/inplace.
+ all_args = schema.filtered_args()
+ value_args = schema.filtered_args(values=True, scalars=False)
+ scalar_args = schema.filtered_args(values=False, scalars=True)
+
+ ctor_args = [f"const {i.lazy_type.cpp_type()}& {i.name}" for i in all_args]
+ reuse_ctor_args = ", ".join(ctor_args)
+ if self.use_lazy_shape and schema.properties.ShapePrecompute:
+            ctor_args.append("std::vector<torch::lazy::Shape>&& shapes")
+ node_ctor_args = ", ".join(ctor_args)
+
+ scalar_initializers = ",\n ".join(
+ [
+ # This code is just special casing the mapping from string_view -> strings
+ f"{a.name}({a.name}.has_value() ? c10::make_optional(std::string(*{a.name})) : c10::nullopt)"
+                if a.lazy_type.cpp_type() == "c10::optional<c10::string_view>"
+ else f"{a.name}({a.name})"
+ for a in scalar_args
+ ]
+ )
+ if len(scalar_initializers):
+ scalar_initializers = f",\n {scalar_initializers}"
+ scalar_decls = "\n ".join(
+ [
+ f"std::string {a.name};"
+ if a.lazy_type.cpp_type() == "c10::string_view"
+                else f"c10::optional<std::string> {a.name};"
+                if a.lazy_type.cpp_type() == "c10::optional<c10::string_view>"
+ else f"{a.lazy_type.cpp_type()} {a.name};"
+ for a in scalar_args
+ ]
+ )
+ optional_values = [
+ arg.name
+ for arg in schema.filtered_args(values=True, scalars=False)
+ if isinstance(arg.lazy_type, OptionalCType)
+ ]
+ has_optional_decls = "\n ".join(
+ [f"bool has_{value}: 1;" for value in optional_values]
+ )
+ has_optional_defs = "\n ".join(
+ [f"has_{value} = !!{value};" for value in optional_values]
+ )
+ members_to_string = []
+ for arg in scalar_args:
+ if isinstance(arg.lazy_type, OptionalCType):
+ value = f"{arg.name}.value()"
+ if arg.is_generator:
+ value = '"torch.Generator()"'
+ members_to_string.append(
+ f"""if ({arg.name}.has_value()) {{
+ ss << ", {arg.name}=" << {value};
+ }} else {{
+ ss << ", {arg.name}=null";
+ }}"""
+ )
+ else:
+ members_to_string.append(f'ss << ", {arg.name}=" << {arg.name};')
+ members_to_string_str = "\n ".join(members_to_string)
+
+ return [
+ f"""\
+class {schema.node_name} : public {self.node_base} {{
+ public:
+ static torch::lazy::OpKind ClassOpKind() {{
+ return torch::lazy::OpKind({opkind});
+ }}
+
+ {schema.node_name}({node_ctor_args})
+ : {self.node_base_ctor_call(schema)}{scalar_initializers}
+ {{
+ {has_optional_defs}
+ }}
+
+ std::string ToString() const override {{
+ std::stringstream ss;
+ ss << {self.node_base}::ToString();
+ {members_to_string_str}
+ return ss.str();
+ }}
+
+ {self.create_function(schema, reuse_ctor_args)}
+
+ {self.can_be_reused_function(schema, reuse_ctor_args)}
+
+ {self.lowering_function(schema)}
+
+ {scalar_decls}
+ {has_optional_decls}
+
+}};
+
+""",
+ ]
+
+
+@dataclass(frozen=True)
+class GenTSLazyIR(GenLazyIR):
+ def lowering_function(self, schema: LazyIrSchema) -> str:
+ signature = """
+ torch::lazy::TSOpVector Lower(
+        std::shared_ptr<torch::jit::GraphFunction> function,
+ torch::lazy::TSLoweringContext* loctx) const override"""
+
+ if schema.properties.LowerDeclOnly:
+ return f"{signature};"
+ elif schema.properties.Lower:
+ return f"""{signature} {{
+ {ts_lowering_body(schema)}
+ }}
+ """
+ else:
+ return ""
+
+ def create_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
+ signature = f"static NodePtr Create({node_ctor_args})"
+ if schema.properties.CreateFnDeclOnly:
+ return f"{signature};"
+ elif not schema.properties.CreateFn:
+ return ""
+ return f"""{signature} {{
+ return ReuseOrMakeNode<{schema.node_name}>(data);
+ }}"""
+
+ def can_be_reused_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
+ signature = f"bool CanBeReused({node_ctor_args}) const"
+ if schema.properties.CanBeReusedDeclOnly:
+ return f"{signature};"
+ elif not schema.properties.CanBeReused:
+ return ""
+ value_comparison = []
+ for arg in itertools.chain(schema.positional_values, schema.keyword_values):
+ if isinstance(arg.lazy_type, OptionalCType):
+ value_comparison.append(
+ f"nullable_operand(i++) == {arg.name}.value_or(kNullValue)"
+ )
+ else:
+ value_comparison.append(f"operand(i++) == {arg.name}")
+ for arg in itertools.chain(schema.positional_scalars, schema.keyword_scalars):
+ if isinstance(arg.lazy_type, OptionalCType):
+ value_comparison.append(
+ f"((!this->{arg.name}&&!{arg.name}) || (this->{arg.name}&&{arg.name} && *(this->{arg.name}) == *{arg.name}))"
+ )
+ else:
+ value_comparison.append(f"this->{arg.name} == {arg.name}")
+ value_comparison_str = " &&\n ".join(value_comparison)
+
+ return f"""{signature} {{
+ size_t i = 0;
+ return ({value_comparison_str});
+ }}"""
+
+
+@dataclass(frozen=True)
+class GenLazyNativeFuncDefinition:
+ class_method_name: str
+ backend_index: BackendIndex
+ tensor_class: str
+ gen_forced_fallback_code: bool
+ backend_namespace: str
+ get_tensorlist: str
+ get_tensor_or_wrap_number: str
+ try_get_tensor: str
+ metrics_counter: str
+ create_tensor: str
+ create_from_first_tensor: bool
+ create_aten_from_ltc_tensor: str
+ tuple_aten_from_ltc_tensors: str
+ lazy_tensor_ptr: str
+ get_device_fn: str
+
+ def lazy_tensor_decls(self, func: NativeFunction, schema: LazyIrSchema) -> str:
+ value_args = schema.filtered_args(values=True, scalars=False)
+ # Generates lazy_{name} variables for LazyTensors wrapping input tensors
+ lazy_tensor_decls: List[str] = []
+ for arg in value_args:
+ if arg.is_wrapped_scalar:
+ if isinstance(arg.lazy_type, OptionalCType):
+ lazy_tensor_decls.append(
+ f"""auto node_{arg.name} = {arg.name} ?
+ c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->
+ GetIrValueForScalarFromCodegen(*{arg.name}, *common_device)):
+ c10::nullopt;"""
+ )
+ else:
+ lazy_tensor_decls.append(
+ f"""auto node_{arg.name} = torch::lazy::LazyGraphExecutor::Get()->
+ GetIrValueForScalarFromCodegen({arg.name}, *common_device);"""
+ )
+ elif arg.is_symint_or_list:
+ continue # values are extracted in isValueType
+ elif isinstance(arg.lazy_type, BaseCType):
+ if arg.lazy_type.type is tensorListValueT:
+ lazy_tensor_decls.append(
+ f"auto lazy_{arg.name}_tensorlist = "
+ f"{self.backend_namespace}::{self.get_tensorlist}({arg.name});"
+ )
+ else:
+ lazy_tensor_decls.append(
+ f"{self.lazy_tensor_ptr} lazy_{arg.name} = "
+ f"{self.backend_namespace}::{self.get_tensor_or_wrap_number}({arg.name}, *common_device);"
+ )
+ elif isinstance(arg.lazy_type, OptionalCType):
+ assert arg.lazy_type.elem == BaseCType(getValueT()), arg.lazy_type.elem
+ # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it
+ # until we encounter a real world example.
+ lazy_tensor_decls.append(
+ f"{self.lazy_tensor_ptr} lazy_{arg.name} = "
+ f"{self.backend_namespace}::{self.try_get_tensor}({arg.name}.value_or(at::Tensor()));"
+ )
+ else:
+ raise AssertionError(
+ f"TODO not sure if there are other valid types to handle here ({arg.lazy_type})"
+ )
+ return ("\n ").join(lazy_tensor_decls)
+
+ def force_eager_fallback(
+ self,
+ func: NativeFunction,
+ schema: LazyIrSchema,
+ metadata: BackendMetadata,
+ sig: Union[DispatcherSignature, NativeSignature],
+ ) -> str:
+ if self.gen_forced_fallback_code:
+ return gen_fallback_code(
+ schema, sig, overload_name=func.func.name.overload_name
+ )
+ return ""
+
+ def metrics(self, func: NativeFunction, schema: LazyIrSchema) -> str:
+ return f"{self.metrics_counter};"
+
+ def get_device(self, func: NativeFunction, schema: LazyIrSchema) -> str:
+ value_args = schema.filtered_args(values=True, scalars=False)
+ scalar_args = schema.filtered_args(values=False, scalars=True)
+ value_types_names = [f"{a.name}" for a in value_args if not a.is_wrapped_scalar]
+ optional_device = OptionalCType(BaseCType(deviceT))
+ optional_devices = [
+ a.name for a in scalar_args if a.lazy_type == optional_device
+ ]
+ assert (
+ len(value_types_names) > 0 or len(optional_devices) > 0
+ ), "Expected at least one Value or Device type"
+ get_device_str = (
+ f"{self.get_device_fn}({', '.join(value_types_names + optional_devices)})"
+ )
+ return f"""auto common_device = {get_device_str};
+ TORCH_INTERNAL_ASSERT(common_device);
+ """
+
+ def shape_inference(self, func: NativeFunction, schema: LazyIrSchema) -> str:
+ metadata = self.backend_index.get_kernel(func)
+ assert metadata is not None
+ all_args = schema.filtered_args()
+ returns_length = len(schema.returns)
+ # call the meta kernel if it exists, to compute output shape/dtype for our IR
+ # Note [Generated LTC Shape Functions]
+ # LTC uses meta tensors from core to do shape inference when possible, and otherwise
+ # we generate a shape function declaration that needs to be manually implemented.
+ # How do we detect which ops are eligible to use meta tensors?
+ # In general we should be able to use meta tensors not just on structured operators,
+ # but also on composite operators that are implemented in terms of structured kernels.
+ # We don't currently have a way of knowing at codegen time which ops are implemented that way.
+ # This is the case for all view and view_copy operators however, so we're going to
+ # use them specifically for all of the view_copy ops (instead of manually writing shape rules for all of them).
+ is_view_copy_op = "view_copy" in func.tags
+ is_structured = func.structured or func.structured_delegate is not None
+ if is_structured or is_view_copy_op:
+ meta_out = """
+std::vector<torch::lazy::Shape> shapes{torch::lazy::Shape(out_meta.scalar_type(), out_meta.sizes().vec())};"""
+ if returns_length > 1:
+
+ def this_shape(i: int) -> str:
+ return f"torch::lazy::Shape(std::get<{i}>(out_meta).scalar_type(), std::get<{i}>(out_meta).sizes().vec())"
+
+ shapes_str = ",".join([this_shape(i) for i in range(returns_length)])
+                meta_out = "std::vector<torch::lazy::Shape> shapes{" + shapes_str + "};"
+
+ # Convert tensor args to the meta device and call it.
+ # (We can't pass in the input tensors directly, because they are "functional wrappers".
+ # If any of the meta kernels call a tensor op and redispatch, we don't want to hit the functionalize kernels.)
+ # Even at::meta:: functions might redispatch, e.g. if they call into view ops.
+ dispatcher_sig = DispatcherSignature.from_schema(func.func)
+ meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig)
+ meta_call_args = [
+ e.expr
+ for e in translate(
+ meta_call_ctx, dispatcher_sig.arguments(), method=False
+ )
+ ]
+ if is_view_copy_op:
+ # view_copy ops always have a CompositeExplicitAutogradNonFunctional kernel
+ assert func.has_composite_explicit_autograd_non_functional_kernel
+ dispatch_ns = "compositeexplicitautogradnonfunctional"
+ else:
+ dispatch_ns = "meta"
+ aten_name = schema.aten_name
+ # TODO: this is trolling
+ if func.func.has_symint() and metadata.supports_symint():
+ aten_name += "_symint"
+ shape_str = f"""\
+ {meta_conversion_str}
+ auto out_meta = at::{dispatch_ns}::{aten_name}({', '.join(meta_call_args)});
+ {meta_out}"""
+ else:
+ shape_sig = ComputeShapeSignature(
+ metadata.kernel, func, symint=metadata.supports_symint()
+ )
+ shape_str = f"""
+ auto shapes = {shape_sig.shape_call};"""
+
+ shape_str += f"""
+ TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});"""
+
+ # Calculating which dimensions are symbolic
+ func_schema_str = "aten::" + str(func.func)
+ shape_str += f"""
+ if(torch::lazy::symbolicShapeEnabled()){{
+                std::vector<torch::jit::IValue> inputs = {{ {', '.join(str(a.name) for a in all_args)} }};
+ const char* schema_str = "{func_schema_str}";
+ applySymbolicShapesOnLT(schema_str, inputs, shapes);
+ }}
+ """
+ return shape_str
+
+ def build_ir_node(self, func: NativeFunction, schema: LazyIrSchema) -> str:
+ node_ctor_input_str = node_ctor_inputs(schema)
+ return f"""torch::lazy::NodePtr node = torch::lazy::ReuseNode<{schema.node_name}>({node_ctor_input_str});
+ if (!node) {{
+ {self.shape_inference(func, schema)}
+ node = torch::lazy::MakeNode<{schema.node_name}>({node_ctor_input_str}, std::move(shapes));
+ CacheNode(node);
+ }}
+ """
+
+ def create_lazy_tensor(self, first_tensor_name: Optional[str] = None) -> str:
+ # xla uses an instance method for tensor creation, for the time being
+ if self.create_from_first_tensor:
+ # TODO(whc) remove this if XLA switches to using static method for creation
+ assert (
+ first_tensor_name is not None
+ ), "Requires first tensor to create lazy tensor"
+ return f"{first_tensor_name}.{self.create_tensor}"
+ return f"{self.backend_namespace}::{self.create_tensor}"
+
+ def return_aten_tensor(self, func: NativeFunction, schema: LazyIrSchema) -> str:
+ returns_length = len(schema.returns)
+ value_args = schema.filtered_args(values=True, scalars=False)
+ value_types_names = [f"{a.name}" for a in value_args if not a.is_wrapped_scalar]
+ first_tensor_name = value_types_names[0] if len(value_types_names) > 0 else None
+ bridge_str = f"""auto result = {self.create_aten_from_ltc_tensor}(
+ {self.create_lazy_tensor(first_tensor_name)}(std::move(node), *common_device));"""
+
+ if returns_length > 1:
+ assert (
+ len(value_types_names) > 0
+ ), "Code below assumes there is at least one tensor arg"
+ bridge_str = f"""std::vector<{self.lazy_tensor_ptr}> lazy_tensors;
+ for (int i = 0; i < {returns_length}; i++) {{
+ lazy_tensors.push_back({self.create_lazy_tensor(first_tensor_name)}({getValueT()}(node, i), *common_device));
+ }}
+ auto result = {self.tuple_aten_from_ltc_tensors}<{returns_length}>(lazy_tensors);"""
+
+ if schema.name.name.inplace or func.func.is_out_fn():
+ assert returns_length == 1, (
+ "We assumed there was no such case where an op is an in-place variant "
+ f"and has tuple outputs, but got tuple of len {returns_length}."
+ )
+ bridge_str = f"""lazy_{first_tensor_name}->SetInPlaceIrValue(node);
+ auto& result = {first_tensor_name};"""
+
+ bridge_str += """
+ return result;"""
+ return bridge_str
+
+ @method_with_native_function
+ def __call__(self, func: NativeFunction) -> List[str]:
+ sig = kernel_signature(func, self.backend_index)
+ metadata = self.backend_index.get_kernel(func)
+ assert metadata is not None
+ schema = LazyIrSchema(func.func, symint=metadata.supports_symint())
+ return [
+ f"""\
+ {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{
+ {self.force_eager_fallback(func, schema, metadata, sig)}
+ {self.metrics(func, schema)}
+ {self.get_device(func, schema)}
+ {self.lazy_tensor_decls(func, schema)}
+ {self.build_ir_node(func, schema)}
+ {self.return_aten_tensor(func, schema)}
+ }}\n
+ """
+ ]
+
+
+class ComputeShapeSignature:
+ """
+ Here we use the base name as the suffix of the signature to avoid generating for in-place variants.
+ """
+
+ def __init__(self, kernel_name: str, f: NativeFunction, *, symint: bool):
+ self.__schema = LazyIrSchema(f.func, symint=symint)
+ self.__dispatch_args = ", ".join(
+ [a.decl() for a in dispatcher.arguments(f.func, symint=symint)]
+ )
+ self.__call_args = ", ".join(
+ [f"{arg.name}" for arg in self.__schema.filtered_args(generator=True)]
+ )
+ self.__kernel_name = kernel_name
+
+ def __decl_suffix(self) -> str:
+ return f"{self.__kernel_name}({self.__dispatch_args})"
+
+ def __call_suffix(self) -> str:
+ return f"{self.__kernel_name}({self.__call_args})"
+
+ @property
+ def shape_decl(self) -> str:
+        return f"TORCH_API std::vector<torch::lazy::Shape> compute_shape_{self.__decl_suffix()}"
+
+ @property
+ def shape_call(self) -> str:
+ return f"torch::lazy::compute_shape_{self.__call_suffix()}"
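+# As an illustration, for a hypothetical kernel named "trace" taking "const at::Tensor & self",
+# shape_decl would read "TORCH_API std::vector<torch::lazy::Shape> compute_shape_trace(const at::Tensor & self)"
+# and shape_call would read "torch::lazy::compute_shape_trace(self)".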
+
+
+@dataclass(frozen=True)
+class GenLazyShapeInferenceDefinition:
+ backend_index: BackendIndex
+ tensor_class: str
+
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> List[str]:
+ sig = kernel_signature(f, self.backend_index)
+ metadata = self.backend_index.get_kernel(f)
+ assert metadata is not None
+
+ # See Note [Generated LTC Shape Functions]
+ is_view_copy_op = "view_copy" in f.tags
+ is_structured = f.structured or f.structured_delegate is not None
+ if is_structured or is_view_copy_op:
+ return []
+ else:
+ shape_sig = ComputeShapeSignature(
+ metadata.kernel, f, symint=metadata.supports_symint()
+ )
+ return ["\n".join([f"{shape_sig.shape_decl};"])]
+
+
+def generate_non_native_lazy_ir_nodes(
+ non_native: List[Dict[str, Any]], gen_lazy_ir: GenLazyIR
+) -> List[str]:
+ """Generate the non-native lazy IR node classes"""
+ nodes = []
+ for op in non_native:
+ # Set default properties for Non-Native IRs
+ properties = LazyIrProperties("ShapeCache", "CanBeReused", "LowerDeclOnly")
+ for p in op.get("properties", []):
+ setattr(properties, p, True)
+
+ # non-native is assumed to want symint bindings if you wrote symint
+ schema = LazyIrSchema(FunctionSchema.parse(op["func"]), properties, symint=True)
+ schema.opkind = op.get("opkind")
+ nodes.append(gen_lazy_ir.gen(schema)[0])
+
+ return nodes
diff --git a/MLPY/Lib/site-packages/torchgen/dest/lazy_ts_lowering.py b/MLPY/Lib/site-packages/torchgen/dest/lazy_ts_lowering.py
new file mode 100644
index 0000000000000000000000000000000000000000..1efbd63d7e7722d39c314afdf5474f80a5994c28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/dest/lazy_ts_lowering.py
@@ -0,0 +1,48 @@
+from torchgen.api.lazy import LazyArgument, LazyIrSchema
+from torchgen.api.types import OptionalCType
+
+
+def ts_lowering_body(schema: LazyIrSchema) -> str:
+ # for now, we just want one IR class decl and soon after also the method defs
+ # and we use the functional version not out/inplace.
+ emplace_arguments = []
+
+ def get_value(arg: LazyArgument) -> str:
+ if isinstance(arg.lazy_type, OptionalCType):
+ return f"has_{arg.name} ? loctx->GetOutputOp(operand(i++)) : nullptr"
+ return "loctx->GetOutputOp(operand(i++))"
+
+ for arg in schema.positional_args:
+ if arg.is_lazy_value:
+ emplace_arguments.append(get_value(arg))
+ continue
+ emplace_arguments.append(f'"{arg.name}", {arg.name}')
+
+ emplace_arguments_str = "\n ".join(
+ [f"arguments.emplace_back({a});" for a in emplace_arguments]
+ )
+ emplace_kwarg_values = [
+ f'"{arg.name}", {get_value(arg)}' for arg in schema.keyword_values
+ ]
+ emplace_kwarg_scalars = [
+ f'"{arg.name}", {arg.name}' for arg in schema.keyword_scalars
+ ]
+ emplace_kwarguments = "\n ".join(
+ [
+ f"kwarguments.emplace_back({a});"
+ for a in emplace_kwarg_values + emplace_kwarg_scalars
+ ]
+ )
+ return f"""\
+    std::vector<torch::jit::NamedValue> arguments;
+    std::vector<torch::jit::NamedValue> kwarguments;
+ arguments.reserve({len(emplace_arguments)});
+ kwarguments.reserve({len(emplace_kwarg_values + emplace_kwarg_scalars)});
+ size_t i = 0;
+ {emplace_arguments_str}
+ {emplace_kwarguments}
+ torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments);
+ TORCH_CHECK_EQ({schema.aten_name}_out.size(), {len(schema.returns)});
+
+ return {schema.aten_name}_out;
+"""
diff --git a/MLPY/Lib/site-packages/torchgen/dest/native_functions.py b/MLPY/Lib/site-packages/torchgen/dest/native_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dd57cc1a839680b2b8cab10ef80259d5282ee82
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/dest/native_functions.py
@@ -0,0 +1,64 @@
+from typing import List, Optional, Union
+
+import torchgen.api.meta as meta
+import torchgen.api.structured as structured
+from torchgen.api.types import kernel_signature
+
+from torchgen.context import with_native_function_and_index
+from torchgen.model import BackendIndex, NativeFunction, NativeFunctionsGroup
+from torchgen.utils import mapMaybe
+
+
+@with_native_function_and_index
+def gen_unstructured(f: NativeFunction, backend_index: BackendIndex) -> Optional[str]:
+ sig = kernel_signature(f, backend_index)
+ metadata = backend_index.get_kernel(f)
+ if metadata is None:
+ return None
+ if "legacy::" in metadata.kernel:
+ return None
+ else:
+ prefix = "static" if backend_index.external else "TORCH_API"
+ return f"{prefix} {sig.decl(name=metadata.kernel)};"
+
+
+@with_native_function_and_index
+def gen_structured(g: NativeFunctionsGroup, backend_index: BackendIndex) -> List[str]:
+ meta_name = meta.name(g)
+ out_args = structured.impl_arguments(g)
+ metadata = backend_index.get_kernel(g)
+ if metadata is None:
+ return []
+ prefix = "" if backend_index.external else "TORCH_API "
+ return [
+ f"""\
+struct {prefix}structured_{metadata.kernel} : public at::meta::structured_{meta_name} {{
+void impl({', '.join(a.decl() for a in out_args)});
+}};
+"""
+ ]
+
+
+# Generates NativeFunctions.h, a list of forward declarations of all
+# actual kernel definitions we keep in aten/src/ATen/native/
+@with_native_function_and_index
+def compute_native_function_declaration(
+ g: Union[NativeFunctionsGroup, NativeFunction], backend_index: BackendIndex
+) -> List[str]:
+ metadata = backend_index.get_kernel(g)
+ if isinstance(g, NativeFunctionsGroup):
+ if metadata is not None and metadata.structured:
+ if backend_index.external:
+ # Structured hasn't been tested with external backends yet.
+ raise AssertionError(
+ "Structured external backend functions are not implemented yet."
+ )
+ else:
+ return gen_structured(g, backend_index)
+ else:
+ return list(
+ mapMaybe(lambda f: gen_unstructured(f, backend_index), g.functions())
+ )
+ else:
+ x = gen_unstructured(g, backend_index)
+ return [] if x is None else [x]
diff --git a/MLPY/Lib/site-packages/torchgen/dest/register_dispatch_key.py b/MLPY/Lib/site-packages/torchgen/dest/register_dispatch_key.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d7260bd925e7e7cf6902b0572e66877c355bd61
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/dest/register_dispatch_key.py
@@ -0,0 +1,989 @@
+import itertools
+import textwrap
+from dataclasses import dataclass
+from typing import List, Literal, Optional, Tuple, Union
+
+import torchgen.api.cpp as cpp
+import torchgen.api.meta as meta
+import torchgen.api.structured as structured
+from torchgen.api.translate import translate
+from torchgen.api.types import (
+ BaseCType,
+ Binding,
+ ConstRefCType,
+ CppSignature,
+ CppSignatureGroup,
+ DispatcherSignature,
+ Expr,
+ kernel_signature,
+ MutRefCType,
+ NamedCType,
+ NativeSignature,
+ tensorT,
+)
+
+from torchgen.context import method_with_native_function, native_function_manager
+from torchgen.model import (
+ Argument,
+ BackendIndex,
+ DeviceCheckType,
+ DispatchKey,
+ gets_generated_out_inplace_wrapper,
+ is_cuda_dispatch_key,
+ NativeFunction,
+ NativeFunctionsGroup,
+ SchemaKind,
+ TensorOptionsArguments,
+)
+from torchgen.selective_build.selector import SelectiveBuilder
+from torchgen.utils import assert_never, mapMaybe, Target
+
+
+def gen_registration_headers(
+ backend_index: BackendIndex,
+ per_operator_headers: bool,
+ rocm: bool,
+) -> List[str]:
+ if per_operator_headers:
+        headers = ["#include <ATen/ops/as_strided_native.h>"]
+    else:
+        headers = ["#include <ATen/NativeFunctions.h>"]
+
+ if backend_index.dispatch_key in (DispatchKey.CPU, DispatchKey.Meta):
+        headers.append("#include <ATen/EmptyTensor.h>")
+    elif backend_index.dispatch_key == DispatchKey.CUDA:
+        if rocm:
+            headers.append("#include <ATen/hip/EmptyTensor.h>")
+        else:
+            headers.append("#include <ATen/cuda/EmptyTensor.h>")
+    elif backend_index.dispatch_key == DispatchKey.MPS:
+        headers.append("#include <ATen/mps/EmptyTensor.h>")
+    elif per_operator_headers:
+        headers += [
+            "#include <ATen/ops/empty.h>",
+            "#include <ATen/ops/empty_strided.h>",
+            "#include <ATen/ops/_copy_from_and_resize.h>",
+            "#include <ATen/ops/_copy_from.h>",
+        ]
+    else:
+        headers.append("#include <ATen/Functions.h>")
+
+ return headers
+
+
+def gen_empty_impl_names(
+ backend_index: BackendIndex,
+) -> Tuple[Optional[str], Optional[str]]:
+ empty_impl = None
+ empty_strided_impl = None
+
+ if backend_index.dispatch_key in (
+ DispatchKey.Meta,
+ DispatchKey.CPU,
+ DispatchKey.CUDA,
+ DispatchKey.MPS,
+ ):
+ dispatch = str(backend_index.dispatch_key).lower()
+ empty_impl = f"at::detail::empty_{dispatch}"
+ empty_strided_impl = f"at::detail::empty_strided_{dispatch}"
+ elif backend_index.dispatch_key in (
+ DispatchKey.CompositeExplicitAutogradNonFunctional,
+ DispatchKey.QuantizedCPU,
+ DispatchKey.QuantizedCUDA,
+ ):
+ empty_impl = "at::empty"
+ empty_strided_impl = "at::empty_strided"
+
+ return empty_impl, empty_strided_impl
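+# E.g. for DispatchKey.CPU this returns ("at::detail::empty_cpu", "at::detail::empty_strided_cpu");
+# for dispatch keys outside both lists it returns (None, None), which suppresses the
+# create_out / maybe_create_proxy helpers generated below.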
+
+
+def gen_create_out_helper(backend_index: BackendIndex) -> List[str]:
+ if backend_index.dispatch_key == DispatchKey.Meta:
+ empty_options = "options.device(at::kMeta)"
+ else:
+ empty_options = "options"
+
+ empty_impl, empty_strided_impl = gen_empty_impl_names(backend_index)
+ if empty_impl is None:
+ return []
+
+ return [
+ f"""
+Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{
+ if (strides.empty()) {{
+ return {empty_impl}(sizes, {empty_options});
+ }} else {{
+ return {empty_strided_impl}(sizes, strides, {empty_options});
+ }}
+}}
+"""
+ ]
+
+
+def gen_maybe_create_proxy_helper(backend_index: BackendIndex) -> List[str]:
+ _, empty_strided_impl = gen_empty_impl_names(backend_index)
+ return (
+ []
+ if empty_strided_impl is None
+ else [
+ f"""
+c10::optional<Tensor> maybe_create_proxy(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{
+ if (out.strides() != strides) {{
+ return {empty_strided_impl}(sizes, strides, options);
+ }}
+ return c10::nullopt;
+}}
+"""
+ ]
+ )
+
+
+def gen_resize_out_helper(backend_index: BackendIndex) -> List[str]:
+ if backend_index.dispatch_key == DispatchKey.CompositeExplicitAutogradNonFunctional:
+ # The function isn't used by this key (since only functional ops have a kernel for this key),
+ # so we need to not include it to avoid a defined-but-not-used error.
+ return []
+ return [
+ """
+void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
+ TORCH_CHECK(options.dtype() == out.dtype(),
+ "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
+ TORCH_CHECK(options.device() == out.device(),
+ "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
+ const bool resized = at::native::resize_output(out, sizes);
+ // Only restride if a resize occurred; otherwise we ignore the (advisory)
+ // strides from the meta function and directly use the output tensor's
+ // preexisting strides
+ if (resized) {
+ if (!strides.empty()) {
+ TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
+ // TODO: avoid the redispatch here
+ out.as_strided_(sizes, strides);
+ } else if (options.memory_format_opt().has_value()) {
+ out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
+ }
+ }
+}
+"""
+ ]
+
+
+def gen_check_inplace_helper(backend_index: BackendIndex) -> List[str]:
+ return [
+ """
+void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) {
+ // These checks are needed on those operators that:
+ // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm')
+ // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod')
+ // For other operators (e.g. 'add'), 'TensorIterator' already checks
+ // these things separately.
+ TORCH_CHECK(options.dtype() == self.dtype(),
+ "Bad in-place call: ",
+ "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match");
+ TORCH_CHECK(options.device() == self.device(),
+ "Bad in-place call: ",
+ "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match");
+ TORCH_CHECK(sizes == self.sizes(),
+ "Bad in-place call: ",
+ "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match");
+}
+"""
+ ]
+
+
+def gen_registration_helpers(backend_index: BackendIndex) -> List[str]:
+ return [
+ *gen_create_out_helper(backend_index),
+ *gen_resize_out_helper(backend_index),
+ *gen_check_inplace_helper(backend_index),
+ *gen_maybe_create_proxy_helper(backend_index),
+ ]
+
+
+# Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp).
+#
+# - The primary function of this file is to register all of the
+# implementations for the given dispatch key to the dispatcher,
+# so they are available for use in PyTorch. If dispatch is
+# None, we generate schema (def) registrations and catchall
+# registrations.
+# - The secondary function of this file is to generate a wrapper
+# around functions. In CPUType these wrappers do nothing
+# (and should be removed), but in other cases they handle
+# DeviceGuard. A small extra benefit of wrappers is they
+# are not overloaded, so they can be used in the registration
+# API without having to disambiguate which overload you want
+# (as would be the case if you directly registered native::
+# functions).
+# - The tertiary function of this file is to generate *static*
+# cpp API bindings which can be used to bypass dispatcher
+# directly to kernels, but with user-friendly cpp-style API
+@dataclass(frozen=True)
+class RegisterDispatchKey:
+ backend_index: BackendIndex
+
+ target: Literal[
+ Target.ANONYMOUS_DEFINITION,
+ Target.NAMESPACED_DEFINITION,
+ Target.NAMESPACED_DECLARATION,
+ Target.REGISTRATION,
+ ]
+
+ # Selector object to determine which operators to generate
+ # registration code for.
+ selector: SelectiveBuilder
+
+ # Whether or not we are actually code-genning for ROCm
+ rocm: bool
+
+ # Whether or not to generate symint registrations or not. External users
+ # of codegen who don't care about symints can set this to false to get
+ # non-SymInt codegen
+ symint: bool
+
+ # The class that all unstructured native functions live under. This is used to improve
+ # compiler error messages when a kernel writer adds a native function with the wrong signature.
+ # This is only used in unstructured kernels, since structured kernels already live in a class.
+ # Finally, this field is currently Optional because it is only used by external backends.
+ # It would be nice if we can add the same logic to in-tree kernels too, but that requires updating
+ # all of the existing kernel signatures scattered across aten/src/ATen/native.
+ class_method_name: Optional[str]
+
+ # Only set to true in lightweight dispatch. If lightweight dispatch is enabled we are registering
+ # operators into JIT op registry, thus we need to avoid generating code to register into the dispatcher.
+ skip_dispatcher_op_registration: bool
+
+ @staticmethod
+ def gen_device_check(
+ type: DeviceCheckType, args: List[Argument], method_name: str
+ ) -> str:
+ if type == DeviceCheckType.NoCheck:
+ return " // No device check\n"
+
+        device_check = "c10::optional<Device> common_device = nullopt;\n"
+ device_check += "(void)common_device; // Suppress unused variable warning\n"
+ for arg in args:
+ # Only tensor like arguments are eligible
+ if arg.type.is_tensor_like():
+ device_check += f"""
+ c10::impl::check_and_update_common_device(common_device, {arg.name}, "{method_name}", "{arg.name}");"""
+ return device_check
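+    # Sketch of the emitted check for a tensor argument "self" of a wrapper named
+    # "add_out" (both names are illustrative): a shared optional<Device> is threaded
+    # through c10::impl::check_and_update_common_device(common_device, self, "add_out", "self");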
+
+ @method_with_native_function
+ def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]:
+ if isinstance(f, NativeFunctionsGroup):
+ g: NativeFunctionsGroup = f
+ # Note: We call gen_structured() if the operator is marked structured, regardless of the backend.
+ # gen_structured() has special logic to handle auto-generated kernels.
+ if g.structured:
+ return self.gen_structured(g)
+ else:
+ return list(
+ mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())
+ )
+ elif isinstance(f, NativeFunction):
+ r = self.gen_unstructured(f)
+ return [] if r is None else [r]
+ else:
+ assert_never(f)
+
+ def wrapper_kernel_sig(
+ self, f: NativeFunction
+ ) -> Union[NativeSignature, DispatcherSignature]:
+ # The prefix is just to ensure uniqueness. The Dispatcher API doesn't guarantee unique kernel names.
+ return DispatcherSignature.from_schema(
+ f.func,
+ prefix=f"wrapper_{self.backend_index.dispatch_key}_{f.func.name.overload_name}_",
+ symint=self.symint,
+ )
+
+ def gen_out_inplace_wrapper(
+ self, f: NativeFunction, g: Optional[NativeFunctionsGroup]
+ ) -> Optional[str]:
+ if g is None:
+ return None
+ k = f.func.kind()
+ if k is SchemaKind.inplace:
+ copy_op = "at::_copy_from"
+ elif k is SchemaKind.out:
+ copy_op = "at::_copy_from_and_resize"
+ else:
+ raise AssertionError("gen_out_inplace_wrapper called on a functional op")
+
+ sig = self.wrapper_kernel_sig(f)
+ name = sig.name()
+
+ func_res = f"{name}_tmp"
+ return_names = cpp.return_names(f)
+ if len(return_names) > 1:
+ updates = "\n ".join(
+ f"{copy_op}(std::get<{i}>({func_res}), {ret_name});"
+ for i, ret_name in enumerate(return_names)
+ )
+ returns = f'{sig.returns_type().cpp_type()}({", ".join(return_names)})'
+ elif len(return_names) == 1:
+ ret_name = return_names[0]
+ updates = f"{copy_op}({func_res}, {ret_name});"
+ returns = ret_name
+ else:
+ assert len(f.func.arguments.out) == 1
+ returns = ""
+ out_arg = f.func.arguments.out[0]
+ if out_arg.type.is_list_like():
+ updates = f"""\
+ for (int64_t i = 0; i < {func_res}.size(); ++i) {{
+ {copy_op}({func_res}[i], {out_arg.name}[i]);
+ }}"""
+ else:
+ updates = f"{copy_op}({func_res}, {out_arg.name});"
+
+ functional_sig = self.wrapper_kernel_sig(g.functional)
+ wrapper_name = sig.name()
+
+ return f"""\
+{sig.defn(name=wrapper_name)} {{
+ auto {func_res} = {functional_sig.name()}({", ".join(e.expr for e in translate(sig.arguments(), functional_sig.arguments()))});
+ {updates}
+ return {returns};
+}}
+"""
+
+ def gen_structured(self, g: NativeFunctionsGroup) -> List[str]:
+ metadata = self.backend_index.get_kernel(g)
+ if self.backend_index.dispatch_key == DispatchKey.Meta:
+ assert not self.backend_index.has_kernel(g.out), (
+ "Do not explicitly specify Meta dispatch key on structured "
+ "functions, they will be automatically generated for you"
+ )
+ elif (
+ self.backend_index.dispatch_key
+ == DispatchKey.CompositeExplicitAutogradNonFunctional
+ ):
+ assert not self.backend_index.has_kernel(g.out), (
+ "Do not explicitly specify CompositeExplicitAutograd dispatch key on structured "
+ "functions, they will be automatically generated for you"
+ )
+ elif metadata is None or not metadata.structured:
+ return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions()))
+ structured_gen = StructuredRegisterDispatchKey(
+ self.backend_index,
+ self.target,
+ self.selector,
+ self.rocm,
+ self.symint,
+ self.class_method_name,
+ self.skip_dispatcher_op_registration,
+ g,
+ )
+ return list(mapMaybe(structured_gen.gen_one, g.functions()))
+
+ def gen_unstructured(
+ self, f: NativeFunction, g: Optional[NativeFunctionsGroup] = None
+ ) -> Optional[str]:
+ with native_function_manager(f):
+ inplace_meta = False
+ gets_out_inplace_wrapper = False
+ if not self.backend_index.has_kernel(f):
+ if (
+ self.backend_index.dispatch_key == DispatchKey.Meta
+ and f.func.kind() is SchemaKind.inplace
+ and
+ # Defer to composites for meta implementation
+ not f.has_composite_kernel
+ and
+ # Inplace list operations are not supported
+ len(f.func.returns) == 1
+ ):
+ inplace_meta = True
+ elif (
+ not self.backend_index.use_out_as_primary
+ and g is not None
+ and gets_generated_out_inplace_wrapper(f, g, self.backend_index)
+ ):
+ # We want to generate inplace/out wrappers, that don't have a kernel for the backend.
+ gets_out_inplace_wrapper = True
+ else:
+ return None
+ if f.manual_kernel_registration:
+ return None
+
+ if (
+ self.target is Target.REGISTRATION
+ and not self.selector.is_native_function_selected(f)
+ ):
+ return None
+
+ sig = self.wrapper_kernel_sig(f)
+
+ name = sig.name()
+ returns_type = sig.returns_type().cpp_type()
+ args = sig.arguments()
+ args_str = ", ".join(a.defn() for a in args)
+
+ # See Note [Direct dispatch bindings]
+ cpp_sig_group = CppSignatureGroup.from_native_function(
+ f, method=False, fallback_binding=False
+ )
+
+ # TODO: dedupe this with the structured codegen
+ if self.target is Target.NAMESPACED_DECLARATION:
+ result = ""
+ for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
+ result += f"TORCH_API {cpp_sig.decl()};\n"
+ return result
+ elif self.target is Target.NAMESPACED_DEFINITION:
+
+ def generate_defn(cpp_sig: CppSignature) -> str:
+ return f"""
+{cpp_sig.defn()} {{
+return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))});
+}}
+"""
+
+ result = ""
+ for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
+ result += generate_defn(cpp_sig)
+ return result
+
+ elif self.target is Target.ANONYMOUS_DEFINITION:
+ # short circuit for inplace_meta
+ if inplace_meta:
+ assert f.func.arguments.self_arg is not None
+ self_arg_name = f.func.arguments.self_arg.argument.name
+ # TODO: handle in place on tensor list
+ return f"""
+{returns_type} {name}({args_str}) {{
+ TORCH_CHECK_NOT_IMPLEMENTED({self_arg_name}.is_meta(),
+ "Cannot inplace into non-meta tensor with meta tensor argument");
+ return {self_arg_name};
+}}
+"""
+
+ # short circuit for generated inplace/out wrappers
+ if gets_out_inplace_wrapper:
+ return self.gen_out_inplace_wrapper(f, g)
+
+ metadata = self.backend_index.get_kernel(f)
+ if metadata is None:
+ return None
+ if self.class_method_name is None:
+ impl_name = f"{metadata.cpp_namespace}::{metadata.kernel}"
+ else:
+ impl_name = f"{metadata.cpp_namespace}::{self.class_method_name}::{metadata.kernel}"
+
+ kernel_sig = kernel_signature(f, self.backend_index)
+
+ args_exprs_str = ", ".join(
+ e.expr
+ for e in translate(
+ sig.arguments(), kernel_sig.arguments(), method=False
+ )
+ )
+
+ device_check = " // No device check\n"
+ # Backends that require device guards presumably also require device checks.
+ if self.backend_index.device_guard:
+ device_check_args = itertools.chain(
+ f.func.arguments.out, f.func.arguments.flat_positional
+ )
+ device_check = RegisterDispatchKey.gen_device_check(
+ f.device_check, list(device_check_args), name
+ )
+
+ device_guard = "// DeviceGuard omitted" # default
+ if f.device_guard and self.backend_index.device_guard:
+ has_tensor_options = any(
+ isinstance(a, TensorOptionsArguments)
+ for a in f.func.arguments.non_out
+ )
+ if has_tensor_options:
+ # kernel is creating a tensor
+ device_guard = """
+ const DeviceGuard device_guard(device_or_default(device));"""
+
+ # CUDA requires special handling
+ if is_cuda_dispatch_key(self.backend_index.dispatch_key):
+ device_guard = (
+ f"globalContext().lazyInitCUDA();\n{device_guard}"
+ )
+ else:
+ # kernel is operating on existing tensors
+
+ # There is precedence for which argument we use to do
+ # device guard. This describes the precedence order.
+ self_arg = (
+ [f.func.arguments.self_arg.argument]
+ if f.func.arguments.self_arg is not None
+ else []
+ )
+ candidate_args = itertools.chain(
+ self_arg,
+ f.func.arguments.out,
+ f.func.arguments.flat_positional,
+ )
+
+ # Only tensor like arguments are eligible
+ device_of = next(
+ (
+ f"{a.name}"
+ for a in candidate_args
+ if a.type.is_tensor_like()
+ ),
+ None,
+ )
+ if device_of is not None:
+ device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));"
+
+ return f"""\
+namespace {{
+
+{returns_type} {name}({args_str}) {{
+ {device_check}
+
+ {device_guard}
+ return {impl_name}({args_exprs_str});
+}}
+
+}} // anonymous namespace
+"""
+
+ elif self.target is Target.REGISTRATION:
+ if f.manual_kernel_registration or self.skip_dispatcher_op_registration:
+ return None
+ else:
+ payload = f"TORCH_FN({name})"
+ return f'm.impl("{f.func.name}",\n{payload});\n'
+ else:
+ assert_never(self.target)
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# STRUCTURED
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+@dataclass(frozen=True)
+class StructuredRegisterDispatchKey(RegisterDispatchKey):
+ g: NativeFunctionsGroup
+
+ def gen_class_set_output_functions(
+ self, k: SchemaKind, parent_class: str, generate_super: bool
+ ) -> str:
+ if generate_super:
+ set_output_super = f"{parent_class}::set_output_raw_strided(output_idx, sizes, strides, options, names);"
+ else:
+ set_output_super = ""
+
+ def gen_set_output_function(name: str, maybe_create_proxy: bool) -> str:
+ return f"""
+void set_output_{name}(
+ int64_t output_idx, IntArrayRef sizes, IntArrayRef strides,
+ TensorOptions options, DimnameList names
+) override {{
+{textwrap.indent(self.gen_class_set_output_body(k, maybe_create_proxy), " ")}
+ if (!names.empty()) {{
+ namedinference::propagate_names(outputs_[output_idx], names);
+ }}
+ // super must happen after, so that downstream can use maybe_get_output
+ // to retrieve the output
+{textwrap.indent(set_output_super, " ")}
+}}
+"""
+
+ return f"""
+{gen_set_output_function("strided", maybe_create_proxy=True)}
+{gen_set_output_function("raw_strided", maybe_create_proxy=False)}
+"""
+
+ def gen_class_set_output_body(self, k: SchemaKind, maybe_create_proxy: bool) -> str:
+ if self.backend_index.dispatch_key in [
+ DispatchKey.CUDA,
+ DispatchKey.MPS,
+ DispatchKey.CompositeExplicitAutogradNonFunctional,
+ ]:
+ maybe_set_guard = """
+auto current_device = guard_.current_device();
+if (C10_UNLIKELY(current_device.has_value())) {
+ TORCH_INTERNAL_ASSERT(*current_device == options.device(),
+ "structured kernels don't support multi-device outputs");
+} else {
+ guard_.reset_device(options.device());
+}
+"""
+ maybe_set_guard_line = maybe_set_guard + "\n"
+ else:
+ maybe_set_guard_line = maybe_set_guard = ""
+
+ if maybe_create_proxy:
+ create_proxy = """
+auto maybe_proxy = maybe_create_proxy(out, sizes, strides, options);
+if (C10_UNLIKELY(maybe_proxy.has_value())) {
+ proxy_outputs_[output_idx] = std::move(maybe_proxy).value();
+}
+"""
+ else:
+ create_proxy = ""
+
+ if k is SchemaKind.functional:
+ assert self.backend_index.dispatch_key in (
+ DispatchKey.Meta,
+ DispatchKey.CPU,
+ DispatchKey.CUDA,
+ DispatchKey.MPS,
+ DispatchKey.CompositeExplicitAutogradNonFunctional,
+ )
+ return f"""{maybe_set_guard_line}
+outputs_[output_idx] = create_out(sizes, strides, options);"""
+ elif k is SchemaKind.inplace:
+ return f"""{maybe_set_guard_line}
+const auto& out = outputs_[output_idx].get();
+check_inplace(out, sizes, options);
+{create_proxy}"""
+ elif k is SchemaKind.out:
+ return f"""{maybe_set_guard_line}
+const auto& out = outputs_[output_idx].get();
+resize_out(out, sizes, strides, options);
+{create_proxy}"""
+ elif k is SchemaKind.mutable or k is SchemaKind.scratch:
+ raise AssertionError(
+ f"{k} structured operators are currently not supported"
+ )
+ else:
+ assert_never(k)
+
+ # returns the definition of a ctor, as well as how to construct
+ # this class to a variable named op
+ def gen_class_ctor(self, k: SchemaKind, class_name: str, returns: int) -> str:
+ if k is SchemaKind.functional:
+ return ""
+ elif k is SchemaKind.inplace:
+ # TODO: Make sure out argument is guaranteed to be self
+ return f"{class_name}(Tensor& self) : outputs_{{std::ref(self)}} {{}}"
+ elif k is SchemaKind.out:
+ out_args = ", ".join(f"Tensor& out{i}" for i in range(returns))
+ out_refs = ", ".join(f"std::ref(out{i})" for i in range(returns))
+ return f"{class_name}({out_args}) : outputs_{{ {out_refs} }} {{}}"
+ elif k is SchemaKind.mutable or k is SchemaKind.scratch:
+ raise AssertionError(
+ f"{k} structured operators are currently not supported"
+ )
+ else:
+ assert_never(k)
+
+ def gen_class(
+ self,
+ f: NativeFunction,
+ k: SchemaKind,
+ *,
+ class_name: str,
+ parent_class: str,
+ generate_super: bool,
+ ) -> str:
+ if k is SchemaKind.functional:
+ output_type = "Tensor"
+ output_value = "outputs_[output_idx]"
+ proxy_field = ""
+ elif k is SchemaKind.inplace:
+            output_type = "std::reference_wrapper<Tensor>"
+            output_value = "proxy_outputs_[output_idx].has_value() ? *proxy_outputs_[output_idx] : outputs_[output_idx].get()"
+            proxy_field = f"std::array<c10::optional<Tensor>, {len(f.func.returns)}> proxy_outputs_;"
+        elif k is SchemaKind.out:
+            output_type = "std::reference_wrapper<Tensor>"
+            output_value = "proxy_outputs_[output_idx].has_value() ? *proxy_outputs_[output_idx] : outputs_[output_idx].get()"
+            proxy_field = f"std::array<c10::optional<Tensor>, {len(f.func.returns)}> proxy_outputs_;"
+
+ if self.backend_index.dispatch_key == DispatchKey.CUDA:
+ if self.rocm:
+ guard_field = "c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;"
+ else:
+ guard_field = "c10::cuda::OptionalCUDAGuard guard_;"
+ elif (
+ self.backend_index.dispatch_key
+ == DispatchKey.CompositeExplicitAutogradNonFunctional
+ ):
+ guard_field = "c10::OptionalDeviceGuard guard_;"
+ elif self.backend_index.dispatch_key == DispatchKey.MPS:
+ # TODO: Move to OptionalMPSGuard.
+ guard_field = "c10::OptionalDeviceGuard guard_;"
+ else:
+ guard_field = ""
+
+ indent = " " * 4
+ class_ctor_str = self.gen_class_ctor(k, class_name, len(f.func.returns))
+ lines = (
+ f"struct {class_name} final : public {parent_class} {{",
+ f"{textwrap.indent(class_ctor_str, indent)}",
+ f"{textwrap.indent(self.gen_class_set_output_functions(k, parent_class, generate_super), indent)}",
+ " const Tensor& maybe_get_output(int64_t output_idx) override {",
+ f" return {output_value};\n", # type: ignore[possibly-undefined] # TODO: audit
+ " }",
+ f" std::array<{output_type}, {len(f.func.returns)}> outputs_;", # type: ignore[possibly-undefined] # TODO: audit
+ f"{textwrap.indent(proxy_field, indent)}", # type: ignore[possibly-undefined] # TODO: audit
+ f"{textwrap.indent(guard_field, indent)}",
+ "};",
+ )
+ return "\n".join(line for line in lines if line)
+
+ @method_with_native_function
+ def gen_one(self, f: NativeFunction) -> Optional[str]:
+ assert not f.manual_kernel_registration
+
+ if (
+ self.target is Target.REGISTRATION
+ and not self.selector.is_native_function_selected(f)
+ ):
+ return None
+
+ # TODO: Now, there is something interesting going on here. In the code below,
+ # we generate CompositeExplicitAutogradNonFunctional implementations of functional and inplace
+ # based on the out implementation. But in fact, out is definable by
+ # functional too (just not very efficiently), and this is honestly the
+ # MORE likely situation for a backend implementor. How do we pick?
+ # Well, taking a page from Haskell type classes and default methods,
+ # we could conceivably register a circular definition (out in terms
+ # of functional, and functional in terms of out) and just require
+ # someone to implement one or the other. We'd have to do a little bit
+ # of work to not register one of these "weak" definitions unless there
+ # is a strong definition somewhere in the DAG! So it's not implemented yet.
+ if (
+ self.backend_index.dispatch_key
+ == DispatchKey.CompositeExplicitAutogradNonFunctional
+ and f.func.kind() is SchemaKind.out
+ ):
+ # Never generate a default implementation for out, that's what you
+ # have to define as a backend implementor
+ return None
+
+ # Note [Direct dispatch bindings]
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ # Signature of the non-dispatched function we'll expose in a header
+ # (e.g., at::cpu::add). We don't generate methods (TODO: do this
+ # when CPUTensor class is a thing); nor do we generate fallback
+ # bindings for manual_cpp_binding functions.
+ cpp_sig_group = CppSignatureGroup.from_native_function(
+ f, method=False, fallback_binding=False
+ )
+
+ # Signature of the wrapper function we'll register to the dispatcher
+ kern = self.backend_index.get_kernel(f)
+ sig = NativeSignature(
+ f.func,
+ prefix=f"wrapper_{self.backend_index.dispatch_key}_",
+ symint=kern is not None and kern.supports_symint(),
+ )
+
+ if self.target is Target.NAMESPACED_DECLARATION:
+ result = ""
+ for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
+ result += f"TORCH_API {cpp_sig.decl()};\n"
+ return result
+
+ elif self.target is Target.NAMESPACED_DEFINITION:
+
+ def generate_defn(cpp_sig: CppSignature) -> str:
+ return f"""
+{cpp_sig.defn()} {{
+return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))});
+}}
+"""
+
+ result = ""
+ for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
+ result += generate_defn(cpp_sig)
+ return result
+
+ elif self.target is Target.ANONYMOUS_DEFINITION:
+ k = f.func.kind()
+
+ # Construct the body of the wrapper function with signature sig
+ sig_body = []
+ # We'll use context to keep track of any variables we've brought
+ # into scope while generating code
+ context: List[Union[Binding, Expr]] = list(sig.arguments())
+
+ # Initialize the class corresponding to this structured
+ # operator; feeding it the output argument(s) if it is known
+ if self.backend_index.dispatch_key is DispatchKey.Meta:
+ class_name = f"structured_{meta.name(self.g)}_meta_{k.name}"
+ parent_class = f"at::meta::structured_{meta.name(self.g)}"
+ elif (
+ self.backend_index.dispatch_key
+ is DispatchKey.CompositeExplicitAutogradNonFunctional
+ ):
+ # TODO: dedup this branch
+ class_name = f"structured_{meta.name(self.g)}_default_backend_{k.name}"
+ parent_class = f"at::meta::structured_{meta.name(self.g)}"
+ else:
+ metadata = self.backend_index.get_kernel(self.g)
+ assert metadata is not None
+ class_name = f"structured_{metadata.kernel}_{k.name}"
+ parent_class = f"{metadata.cpp_namespace}::structured_{metadata.kernel}"
+
+ if self.backend_index.device_guard:
+ device_check_args = itertools.chain(
+ f.func.arguments.out, f.func.arguments.flat_positional
+ )
+ sig_body.append(
+ RegisterDispatchKey.gen_device_check(
+ f.device_check, list(device_check_args), sig.name()
+ )
+ )
+
+ if k is SchemaKind.functional:
+ sig_body.append(f"{class_name} op;")
+ elif k is SchemaKind.inplace:
+ sig_body.append(f"{class_name} op(self);")
+ elif k is SchemaKind.out:
+ out_args_str = ", ".join(a.name for a in f.func.arguments.out)
+ sig_body.append(f"{class_name} op({out_args_str});")
+
+ # Translate the input native arguments into structured
+ # arguments for the meta call
+ meta_exprs = ", ".join(
+ e.expr
+ for e in translate(
+ context, structured.meta_arguments(self.g), method=False
+ )
+ )
+
+ if self.g.out.precomputed:
+ # If this function group has precomputed elements, the meta function
+ # returns a struct containing them which must be saved so that it
+ # can be unpacked when generating code to call the impl.
+ sig_body.append(f"auto precompute = op.meta({meta_exprs});")
+
+ # Put all of the contents of the precompute struct into the context
+ # so that translate will be able to return the correct args for the
+ # call to the impl.
+ precomputed_values = [
+ *self.g.out.precomputed.replace.values(),
+ self.g.out.precomputed.add,
+ ]
+ for precomputed_elems in precomputed_values:
+ for arg in precomputed_elems:
+ context.append(
+ Expr(
+ expr=f"precompute.{arg.name}",
+ type=structured.argument_type(arg, binds=arg.name),
+ )
+ )
+
+ # Add a use of the precompute struct so FB internal compilers don't
+ # complain that there is an unused variable.
+ sig_body.append("(void)precompute;")
+ else:
+ sig_body.append(f"op.meta({meta_exprs});")
+
+ # After running meta, op.outputs_ is guaranteed to be valid;
+ # add it to the context
+ out_args = structured.out_arguments(self.g)
+ for i, out_arg in enumerate(out_args):
+ assert ConstRefCType(BaseCType(tensorT)) == out_arg.nctype.type
+
+ if k is SchemaKind.out:
+ expr = f"op.maybe_get_output({i})"
+ else:
+ expr = f"op.outputs_[{i}]"
+
+ context.append(
+ Expr(
+ expr=expr,
+ # TODO: Stop hardcoding that the output type is a Tensor. Note
+ # that for the codegen here this is fine because outputs_ is
+ # hardcoded to be tensor already
+ type=NamedCType(
+ out_arg.nctype.name, MutRefCType(BaseCType(tensorT))
+ ),
+ )
+ )
+
+ # With the expanded context, do the impl call (if not a meta
+ # function)
+ if (
+ self.backend_index.dispatch_key
+ == DispatchKey.CompositeExplicitAutogradNonFunctional
+ ):
+ # TODO: https://github.com/pytorch/pytorch/issues/53023
+ out_sig_group = CppSignatureGroup.from_native_function(
+ self.g.out, method=False, fallback_binding=f.manual_cpp_binding
+ )
+ out_sig = out_sig_group.most_faithful_signature()
+ api_name = out_sig.name()
+ out_exprs = ", ".join(
+ e.expr
+ for e in translate(context, out_sig.arguments(), method=False)
+ )
+ # TODO: I think this means structured won't work with method
+ # only functions (but maybe you're saved by faithful? iunno.)
+ # NB: Originally I wrote this as an at::redispatch call, but
+ # I got in trouble because that meant I needed a DispatchKeySet
+ # in the wrapper function, which meant I needed a DispatchKeySet
+ # in the DispatchKeyFunctions declarations, but the defined API
+ # there does NOT permit a dispatch key set. I think you can
+ # probably unwind this by calling some function to do the TLS
+ # fetch and get the DispatchKeySet when you don't have it, but
+ # I didn't do it for this version
+ sig_body.append(f"at::{api_name}({out_exprs});")
+ elif self.backend_index.dispatch_key != DispatchKey.Meta:
+ impl_exprs = ", ".join(
+ e.expr
+ for e in translate(
+ context, structured.impl_arguments(self.g), method=False
+ )
+ )
+ sig_body.append(f"op.impl({impl_exprs});")
+
+ # Go over each output, and check if there is a proxy created for it.
+ # If so, copy it over to the original output.
+ if k is SchemaKind.out or k is SchemaKind.inplace:
+ for i in range(len(f.func.returns)):
+ sig_body.append(
+ f"if (op.proxy_outputs_[{i}].has_value()) op.outputs_[{i}].get().copy_(*op.proxy_outputs_[{i}]);"
+ )
+
+ # Destructively return the final tensors
+ # TODO: Do this in translate instead
+ if k is SchemaKind.functional:
+ if len(f.func.returns) == 1:
+ ret_expr = "std::move(op.outputs_[0])" # small optimization
+ else:
+ moved = ", ".join(
+ f"std::move(op.outputs_[{i}])"
+ for i in range(len(f.func.returns))
+ )
+ ret_expr = f"std::make_tuple({moved})"
+ elif k is SchemaKind.inplace:
+ ret_expr = "self"
+ elif k is SchemaKind.out:
+ if len(f.func.returns) == 1:
+ ret_expr = f.func.arguments.out[0].name
+ else:
+ refs = ", ".join(a.name for a in f.func.arguments.out)
+ ret_expr = f"std::forward_as_tuple({refs})"
+ sig_body.append(f"return {ret_expr};") # type: ignore[possibly-undefined] # TODO: audit
+
+ sig_body_str = "\n".join(sig_body)
+
+ # For an overview of what this template code looks like, see
+ # https://github.com/pytorch/rfcs/pull/9
+ return f"""\
+{self.gen_class(
+f, k,
+class_name=class_name,
+parent_class=parent_class,
+generate_super=self.g.out.structured_inherits is not None
+)}
+
+{sig.defn()} {{
+{sig_body_str}
+}}
+"""
+
+ elif self.target is Target.REGISTRATION:
+ return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));'
+ else:
+ assert_never(self.target)
+ # Silence mypy's "Missing return statement" error
+ return None
diff --git a/MLPY/Lib/site-packages/torchgen/dest/ufunc.py b/MLPY/Lib/site-packages/torchgen/dest/ufunc.py
new file mode 100644
index 0000000000000000000000000000000000000000..52268451ccfd19ea683fac751132581b1d3115db
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/dest/ufunc.py
@@ -0,0 +1,545 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import torchgen.api.ufunc as ufunc
+from torchgen.api.translate import translate
+from torchgen.api.types import (
+ BaseCType,
+ Binding,
+ CType,
+ Expr,
+ NamedCType,
+ opmath_t,
+ scalar_t,
+ StructuredImplSignature,
+ VectorizedCType,
+)
+from torchgen.api.ufunc import UfunctorBindings
+from torchgen.context import with_native_function
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ DispatchKey,
+ NativeFunctionsGroup,
+ ScalarType,
+ UfuncKey,
+)
+from torchgen.utils import OrderedSet
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# CUDA STUFF
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+# NB: not bothering to generate dispatch stub forward declaration in header,
+# we can just paste it wherever necessary
+
+# TODO: use BackendIndex
+# dispatch_key: DispatchKey # only CPU/CUDA right now
+
+
+# Represents functors for implementing CUDA ufuncs.
+# Functors are templated by scalar_t because when USERS instantiate functors
+# they are templated. A functor looks something like this:
+#
+# template <typename scalar_t>
+# struct CUDAFunctorOnSelf_add {
+# using opmath_t = at::opmath_type<scalar_t>;
+# opmath_t other_;
+# opmath_t alpha_;
+# CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha)
+# : other_(other), alpha_(alpha) {}
+# __device__ scalar_t operator()(scalar_t self) {
+# return ufunc::add(static_cast<opmath_t>(self), other_, alpha_);
+# }
+# };
+#
+@dataclass(frozen=True)
+class UfunctorSignature:
+ g: NativeFunctionsGroup
+ scalar_tensor_idx: Optional[int]
+ name: str
+
+ def arguments(self) -> UfunctorBindings:
+ return ufunc.ufunctor_arguments(
+ self.g, scalar_tensor_idx=self.scalar_tensor_idx, scalar_t=scalar_t
+ )
+
+ def fields(self) -> List[Binding]:
+ # fields are renamed to have a trailing underscore, as is conventional
+ return [b.rename(f"{b.name}_") for b in self.arguments().ctor]
+
+ def returns_type(self) -> CType:
+ # TODO: don't hardcode; return type will be inferred based on tags on
+ # the native function
+ return BaseCType(scalar_t)
+
+ def decl_fields(self) -> str:
+ return "\n".join(f"{f.type} {f.name};" for f in self.fields())
+
+ def inline_defn_ctor(self) -> str:
+ args_str = ", ".join(a.decl() for a in self.arguments().ctor)
+ # NB: hypothetically could do this with translate but the
+ # transition here is very regular
+ init_str = ", ".join(f"{a.name}_({a.name})" for a in self.arguments().ctor)
+ return f"{self.name}({args_str}) : {init_str} {{}}"
+
+ def decl_apply(self) -> str:
+ args_str = ", ".join(a.decl() for a in self.arguments().apply)
+ return f"{self.returns_type().cpp_type()} operator()({args_str}) const"
+
+
+@dataclass(frozen=True)
+class UfuncSignature:
+ g: NativeFunctionsGroup
+ name: str
+ compute_t: CType
+
+ def arguments(self) -> List[Binding]:
+ return ufunc.ufunc_arguments(self.g, compute_t=self.compute_t)
+
+ def call(self, ctx: Sequence[Union[Binding, Expr]]) -> str:
+ return f"{self.name}({', '.join(a.expr for a in translate(ctx, self.arguments()))})"
+
+
+# steps:
+# 1. take the functional signature
+# 2. use api.ufunc to convert it to template signature. this establishes
+# the type of the template function
+# 3. use api.ufunc (II) to generate a split struct / operator() signature.
+# this establish context in which we call the template signature
+#
+# StructuredImplSignature context
+# ~> functor constructor sig
+#
+# Functor constructor context
+# ~> functor fields sig
+#
+# Functor apply context (functor fields + functor apply sig)
+# ~> template sig
+#
+
+
+def eligible_for_binary_scalar_specialization(g: NativeFunctionsGroup) -> bool:
+ num_tensors = sum(
+ 1 for a in g.functional.func.arguments.flat_non_out if a.type.is_tensor_like()
+ )
+ return num_tensors == 2
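+# Illustrative: a schema such as "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1)"
+# has exactly two tensor-like arguments and therefore qualifies for the binary
+# scalar specializations; a single-Tensor unary op does not.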
+
+
+def compute_ufunc_cuda_functors(
+ g: NativeFunctionsGroup,
+) -> Tuple[Dict[ScalarType, Dict[UfuncKey, UfunctorSignature]], str]:
+ # First, build the functors.
+ ufunctor_sigs: Dict[ScalarType, Dict[UfuncKey, UfunctorSignature]] = {}
+ ufunctors: List[str] = []
+ loops = g.out.ufunc_inner_loop
+ scalar_tensor_idx_lookup = {
+ UfuncKey.CUDAFunctorOnSelf: 1,
+ UfuncKey.CUDAFunctorOnOther: 0,
+ UfuncKey.CUDAFunctor: None,
+ }
+ if eligible_for_binary_scalar_specialization(g):
+ keys = [
+ UfuncKey.CUDAFunctorOnSelf,
+ UfuncKey.CUDAFunctorOnOther,
+ UfuncKey.CUDAFunctor,
+ ]
+ else:
+ keys = [UfuncKey.CUDAFunctor]
+ for k in [UfuncKey.CUDAFunctorOnSelf, UfuncKey.CUDAFunctorOnOther]:
+ assert k not in loops, f"cannot use {k} on non-binary function"
+ for k in keys:
+ # If the key was directly defined, skip functor codegen; we assume the
+ # user has already done it for us
+ if k in loops:
+ ufunctor_sig = UfunctorSignature(
+ g, scalar_tensor_idx=scalar_tensor_idx_lookup[k], name=loops[k].name
+ )
+ for dtype in loops[k].supported_dtypes:
+ ufunctor_sigs.setdefault(dtype, {})[k] = ufunctor_sig
+ continue
+
+ # Note [ScalarOnly and Generic must match names for CUDA]
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ # Otherwise, look in ANY of the generic entries. For simplicity of
+ # codegen, if both ScalarOnly and Generic are defined, the ufunc name
+ # must match (if they didn't match, we'd have to generate distinct
+ # functors per dtype, which is awful, so we're not going to do it unless
+ # someone really forces us to)
+ ufunc_name = None
+ supported_dtypes: OrderedSet[ScalarType] = OrderedSet()
+ for lk in [UfuncKey.ScalarOnly, UfuncKey.Generic]:
+ if lk not in loops:
+ continue
+ if ufunc_name is None:
+ ufunc_name = loops[lk].name
+ else:
+ # See Note [ScalarOnly and Generic must match names for CUDA]
+ assert (
+ ufunc_name == loops[lk].name
+ ), "ScalarOnly and Generic must have same ufunc name"
+ supported_dtypes |= loops[lk].supported_dtypes
+ assert ufunc_name is not None
+
+ name = f"{k}_{ufunc_name}"
+ ufunctor_sig = UfunctorSignature(
+ g, scalar_tensor_idx=scalar_tensor_idx_lookup[k], name=name
+ )
+ for dtype in supported_dtypes:
+ ufunctor_sigs.setdefault(dtype, {})[k] = ufunctor_sig
+
+ ufunc_sig = UfuncSignature(
+ g, name=f"ufunc::{ufunc_name}", compute_t=BaseCType(opmath_t)
+ )
+ apply_ctx = ufunctor_sig.fields() + ufunctor_sig.arguments().apply
+ ufunctors.append(
+ f"""
+template <typename scalar_t>
+struct {ufunctor_sig.name} {{
+ using opmath_t = at::opmath_type<scalar_t>;
+ {ufunctor_sig.decl_fields()}
+ {ufunctor_sig.inline_defn_ctor()}
+ __device__ {ufunctor_sig.decl_apply()} {{
+ return {ufunc_sig.call(apply_ctx)};
+ }}
+}};
+"""
+ )
+
+ return ufunctor_sigs, "\n".join(ufunctors)
+
+
+@dataclass(frozen=True)
+class BinaryScalarSpecializationConfig:
+ scalar_idx: int
+ ctor_tensor: str
+ ufunc_key: UfuncKey
+
+
+BinaryScalarSpecializationConfigs = [
+ BinaryScalarSpecializationConfig(
+ scalar_idx=0,
+ ctor_tensor="self",
+ ufunc_key=UfuncKey.CUDAFunctorOnOther,
+ ),
+ BinaryScalarSpecializationConfig(
+ scalar_idx=1,
+ ctor_tensor="other",
+ ufunc_key=UfuncKey.CUDAFunctorOnSelf,
+ ),
+]
+
+
+def compute_ufunc_cuda_dtype_body(
+ g: NativeFunctionsGroup,
+ dtype: ScalarType,
+ inner_loops: Dict[UfuncKey, UfunctorSignature],
+ parent_ctx: Sequence[Binding],
+) -> str:
+ body = "using opmath_t = at::opmath_type;"
+ body += "if (false) {}\n" # for ease of codegen
+ for config in BinaryScalarSpecializationConfigs:
+ if config.ufunc_key not in inner_loops:
+ continue
+ ufunctor_sig = inner_loops[config.ufunc_key]
+ scalar_idx = config.scalar_idx + 1
+ # Make a copy and at the same time widen the type (not permissible
+ # without copy; we don't want to mutate the input argument anyway)
+ ctx: List[Union[Expr, Binding]] = list(parent_ctx)
+ ctx.append(
+ Expr(
+ expr=f"iter.scalar_value({scalar_idx})",
+ type=NamedCType(config.ctor_tensor, BaseCType(opmath_t)),
+ )
+ )
+ ufunctor_ctor_exprs_str = ", ".join(
+ a.expr for a in translate(ctx, ufunctor_sig.arguments().ctor)
+ )
+
+ # NB: ufunctor must be allocated before iter.remove_operand is called,
+ # as it relies on iter
+ body += f"""\
+else if (iter.is_cpu_scalar({scalar_idx})) {{
+ {ufunctor_sig.name}<scalar_t> ufunctor({ufunctor_ctor_exprs_str});
+ iter.remove_operand({scalar_idx});
+ gpu_kernel(iter, ufunctor);
+}}"""
+
+ ufunctor_sig = inner_loops[UfuncKey.CUDAFunctor]
+ ufunctor_ctor_exprs_str = ", ".join(
+ a.expr for a in translate(parent_ctx, ufunctor_sig.arguments().ctor)
+ )
+ body += f"""
+else {{
+ gpu_kernel(iter, {ufunctor_sig.name}<scalar_t>({ufunctor_ctor_exprs_str}));
+}}
+ """
+ return body
+
+
+@with_native_function
+def compute_ufunc_cuda(g: NativeFunctionsGroup) -> str:
+ # First, build the functors, indexing them by dtype
+ ufunctor_sigs, ufunctors = compute_ufunc_cuda_functors(g)
+
+ # Next, build the conditionals
+ sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.CUDA))
+ dtype_cases = []
+ for dtype, inner_ufunc_sigs in ufunctor_sigs.items():
+ dtype_cases.append(
+ f"""
+AT_DISPATCH_CASE(at::ScalarType::{dtype},
+ [&]() {{
+ {compute_ufunc_cuda_dtype_body(g, dtype, inner_ufunc_sigs, sig.arguments())}
+ }}
+)
+"""
+ )
+
+ dtype_cases_str = "\n".join(dtype_cases)
+
+ stub_sig = StubSignature(g)
+
+ return f"""
+{ufunctors}
+
+{stub_sig.type_defn()};
+{stub_sig.dispatch_decl()};
+
+{stub_sig.kernel_defn()} {{
+ AT_DISPATCH_SWITCH(iter.common_dtype(), "{sig.name}",
+ {dtype_cases_str}
+ );
+}}
+REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name});
+
+{sig.defn()} {{
+ {stub_sig.direct_call(sig.arguments())};
+}}
+"""
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# CPU STUFF
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+@dataclass(frozen=True)
+class StubSignature:
+ g: NativeFunctionsGroup
+
+ @property
+ def name(self) -> str:
+ return f"{str(self.g.functional.func.name.name)}_stub"
+
+ @property
+ def kernel_name(self) -> str:
+ return f"{str(self.g.functional.func.name.name)}_kernel"
+
+ @property
+ def type_name(self) -> str:
+ return f"{str(self.g.functional.func.name.name)}_fn"
+
+ def arguments(self) -> List[Binding]:
+ return ufunc.stub_arguments(self.g)
+
+ def type(self) -> str:
+ cpp_args = self.arguments()
+ return f"void(*)(TensorIteratorBase&, {', '.join(a.type for a in cpp_args)})"
+
+ def dispatch_decl(self) -> str:
+ return f"DECLARE_DISPATCH({self.type_name}, {self.name})"
+
+ def dispatch_defn(self) -> str:
+ return f"DEFINE_DISPATCH({self.name})"
+
+ def kernel_defn(self) -> str:
+ return f"void {self.kernel_name}(TensorIteratorBase& iter, {', '.join(a.defn() for a in self.arguments())})"
+
+ def type_defn(self) -> str:
+ return f"using {self.type_name} = {self.type()}"
+
+ # must be called from context where this is TensorIteratorBase*
+ def call(self, ctx: Sequence[Binding]) -> str:
+ return f"{self.name}(device_type(), *this, {', '.join(a.expr for a in translate(ctx, self.arguments()))})"
+
+ # used in CUDA to skip the unnecessary dynamic dispatch
+ def direct_call(self, ctx: Sequence[Binding]) -> str:
+ return f"{self.kernel_name}(*this, {', '.join(a.expr for a in translate(ctx, self.arguments()))})"
+
+
+@with_native_function
+def compute_ufunc_cpu(g: NativeFunctionsGroup) -> str:
+ stub_sig = StubSignature(g)
+ sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.CPU))
+
+ return f"""
+{stub_sig.type_defn()};
+{stub_sig.dispatch_decl()};
+{stub_sig.dispatch_defn()};
+
+{sig.defn()} {{
+ {stub_sig.call(sig.arguments())};
+}}
+"""
+
+
+def compute_ufunc_cpu_dtype_body(
+ g: NativeFunctionsGroup,
+ dtype: ScalarType,
+ inner_loops: Dict[UfuncKey, UfuncSignature],
+ parent_ctx: Sequence[Binding],
+) -> str:
+ assert UfuncKey.CPUScalar in inner_loops, f"{dtype}, {inner_loops.keys()}"
+ assert inner_loops.keys() <= {UfuncKey.CPUScalar, UfuncKey.CPUVector}
+ scalar_loop = inner_loops[UfuncKey.CPUScalar]
+ vec_loop = None
+ if UfuncKey.CPUVector in inner_loops:
+ vec_loop = inner_loops[UfuncKey.CPUVector]
+
+ # NB: We DON'T use translate here, because translate is
+ # incapable of CSE'ing the scalar accesses in case it is also
+ # used by Vectorized; also, the unpacking here is very simple
+ # and only affects Scalar; everything else is implicitly captured
+ # by the lambda
+
+ # Setup scalar in scope
+ body = []
+ ctx = []
+ for b in parent_ctx:
+ if isinstance(b.argument, Argument) and b.argument.type != BaseType(
+ BaseTy.Scalar
+ ):
+ continue
+ body.append(f"auto _s_{b.name} = {b.name}.to();")
+ ctx.append(Expr(f"_s_{b.name}", NamedCType(b.nctype.name, BaseCType(scalar_t))))
+ if vec_loop is not None:
+ for b in parent_ctx:
+ if isinstance(b.argument, Argument) and b.argument.type != BaseType(
+ BaseTy.Scalar
+ ):
+ continue
+ body.append(
+ f"auto _v_{b.name} = at::vec::Vectorized(_s_{b.name});"
+ )
+ ctx.append(
+ Expr(
+ f"_v_{b.name}",
+ NamedCType(b.nctype.name, VectorizedCType(BaseCType(scalar_t))),
+ )
+ )
+
+ # Setup lambda signature
+ # NB: simplified version of ufunctor_arguments
+ scalar_bindings = []
+ vec_bindings = []
+ for a in g.functional.func.arguments.flat_non_out:
+ if not a.type.is_tensor_like():
+ continue
+ assert a.type == BaseType(BaseTy.Tensor)
+ scalar_bindings.append(
+ Binding(
+ name=a.name,
+ nctype=NamedCType(a.name, BaseCType(scalar_t)),
+ argument=a,
+ )
+ )
+ if vec_loop is not None:
+ vec_bindings.append(
+ Binding(
+ name=a.name,
+ nctype=NamedCType(a.name, VectorizedCType(BaseCType(scalar_t))),
+ argument=a,
+ )
+ )
+
+ def with_ctx(b: Sequence[Binding]) -> List[Union[Expr, Binding]]:
+ r: List[Union[Expr, Binding]] = []
+ r.extend(ctx)
+ r.extend(b)
+ return r
+
+ body_str = "\n".join(body)
+ if vec_loop is not None:
+ return f"""
+{body_str}
+cpu_kernel_vec(iter,
+ [=]({', '.join(b.decl() for b in scalar_bindings)}) {{ return {scalar_loop.call(with_ctx(scalar_bindings))}; }},
+ [=]({', '.join(b.decl() for b in vec_bindings)}) {{ return {vec_loop.call(with_ctx(vec_bindings))}; }}
+);
+"""
+ else:
+ return f"""
+{body_str}
+cpu_kernel(iter,
+ [=]({', '.join(b.decl() for b in scalar_bindings)}) {{ return {scalar_loop.call(with_ctx(scalar_bindings))}; }}
+);
+"""
+
+
+@with_native_function
+def compute_ufunc_cpu_kernel(g: NativeFunctionsGroup) -> str:
+ stub_sig = StubSignature(g)
+
+ # Reindex the ufunc by dtypes; processing generic/scalaronly as well
+ loops = g.out.ufunc_inner_loop
+ ufunc_sigs: Dict[ScalarType, Dict[UfuncKey, UfuncSignature]] = {}
+ for k in [UfuncKey.CPUScalar, UfuncKey.CPUVector]:
+ lks = []
+ # ORDER MATTERS: this specifies overriding precedence
+ if k in loops: # should happen rarely
+ lks.append(k)
+ if UfuncKey.ScalarOnly in loops and k is UfuncKey.CPUScalar:
+ lks.append(UfuncKey.ScalarOnly)
+ if UfuncKey.Generic in loops:
+ lks.append(UfuncKey.Generic)
+ # TODO: don't hardcode ufunc:: namespace here, should be centralized smh
+ for lk in lks:
+ for dtype in loops[lk].supported_dtypes:
+ compute_t: CType
+ if k is UfuncKey.CPUScalar:
+ compute_t = BaseCType(scalar_t)
+ elif k is UfuncKey.CPUVector:
+ compute_t = VectorizedCType(BaseCType(scalar_t))
+ else:
+ raise AssertionError()
+ inner_ufunc_sigs = ufunc_sigs.setdefault(dtype, {})
+ if k not in inner_ufunc_sigs:
+ inner_ufunc_sigs[k] = UfuncSignature(
+ g, name=f"ufunc::{loops[lk].name}", compute_t=compute_t
+ )
+
+ # Build the conditionals
+ dtype_cases = []
+ for dtype, inner_ufunc_sigs in ufunc_sigs.items():
+ dtype_cases.append(
+ f"""
+AT_DISPATCH_CASE(at::ScalarType::{dtype},
+ [&]() {{
+ {compute_ufunc_cpu_dtype_body(g, dtype, inner_ufunc_sigs, stub_sig.arguments())}
+ }}
+)
+"""
+ )
+
+ dtype_cases_str = "\n".join(dtype_cases)
+ return f"""
+namespace {{
+
+{stub_sig.kernel_defn()} {{
+ AT_DISPATCH_SWITCH(iter.common_dtype(), "{stub_sig.name}",
+ {dtype_cases_str}
+ );
+}}
+
+}} // anonymous namespace
+
+{stub_sig.type_defn()};
+{stub_sig.dispatch_decl()};
+REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name});
+"""
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/__init__.py b/MLPY/Lib/site-packages/torchgen/executorch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55d4acb331d655ed83b137760e9f9a42e4a53e51
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/model.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/model.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2838abb7f2ff928f533ef48eb785b23486fbceb9
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/model.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/parse.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/parse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fec643674a9ea0bd081348674b3f6d93fcabeba
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/__pycache__/parse.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/__init__.py b/MLPY/Lib/site-packages/torchgen/executorch/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baeecad59b41a42c38ce93e9a63c6758b0884d31
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/custom_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/custom_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b8d9ca8a8895e68584df331a6941f80bc4815b1
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/custom_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/et_cpp.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/et_cpp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4f8ab3e0a78ddfd052420d5d5397f1e92938e13
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/et_cpp.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/unboxing.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/unboxing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6b2ea7ca57d58ec900533657396dfb33dffd598
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/__pycache__/unboxing.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/custom_ops.py b/MLPY/Lib/site-packages/torchgen/executorch/api/custom_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f606cbd00d9227437823ea7b1756b6538e8c55b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/api/custom_ops.py
@@ -0,0 +1,142 @@
+from collections import defaultdict
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Tuple
+
+from torchgen import dest
+
+# disable import sorting to avoid circular dependency.
+from torchgen.api.types import DispatcherSignature # isort:skip
+from torchgen.context import method_with_native_function
+from torchgen.executorch.model import ETKernelIndex
+from torchgen.model import BaseTy, BaseType, DispatchKey, NativeFunction, Variant
+from torchgen.selective_build.selector import SelectiveBuilder
+from torchgen.utils import concatMap, Target
+
+
+# Generates RegisterKernelStub.cpp, which provides placeholder kernels for custom operators. This will be used on
+# the model authoring side.
+@dataclass(frozen=True)
+class ComputeNativeFunctionStub:
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> Optional[str]:
+ if Variant.function not in f.variants:
+ return None
+
+ sig = DispatcherSignature.from_schema(
+ f.func, prefix=f"wrapper_CPU_{f.func.name.overload_name}_", symint=False
+ )
+ assert sig is not None
+ if len(f.func.returns) == 0:
+ ret_name = ""
+ elif len(f.func.returns) == 1:
+ if f.func.arguments.out:
+ ret_name = f.func.arguments.out[0].name
+ else:
+ ret_name = next(
+ (
+ a.name
+ for a in f.func.arguments.flat_non_out
+ if a.type == f.func.returns[0].type
+ ),
+ "",
+ )
+ if not ret_name:
+ # if return type is tensor
+ if f.func.returns[0].type == BaseType(BaseTy.Tensor):
+ # Returns an empty tensor
+ ret_name = "at::Tensor()"
+ else:
+ raise Exception(f"Can't handle this return type {f.func}")
+ elif len(f.func.arguments.out) == len(f.func.returns):
+ # Returns a tuple of out arguments
+ tensor_type = "at::Tensor &"
+ comma = ", "
+ ret_name = f"""::std::tuple<{comma.join([tensor_type] * len(f.func.returns))}>(
+ {comma.join([r.name for r in f.func.arguments.out])}
+ )"""
+ else:
+ assert all(
+ a.type == BaseType(BaseTy.Tensor) for a in f.func.returns
+ ), f"Only support tensor returns but got {f.func.returns}"
+ # Returns a tuple of empty tensors
+ tensor_type = "at::Tensor"
+ comma = ", "
+ ret_name = f"""::std::tuple<{comma.join([tensor_type] * len(f.func.returns))}>(
+ {comma.join(["at::Tensor()" for _ in f.func.returns])}
+ )"""
+ ret_str = f"return {ret_name};" if len(f.func.returns) > 0 else ""
+ return f"""
+{sig.defn()} {{
+ {ret_str}
+}}
+ """
+
+
+def gen_custom_ops_registration(
+ *,
+ native_functions: Sequence[NativeFunction],
+ selector: SelectiveBuilder,
+ kernel_index: ETKernelIndex,
+ rocm: bool,
+) -> Tuple[str, str]:
+ """
+ Generate custom ops registration code for dest.RegisterDispatchKey.
+
+ :param native_functions: a sequence of `NativeFunction`
+ :param selector: for selective build.
+ :param kernel_index: kernels for all the ops.
+ :param rocm: bool for dest.RegisterDispatchKey.
+ :return: generated C++ code to register custom operators into PyTorch
+ """
+
+ # convert kernel index to BackendIndex. This is because we can't handle ETKernelIndex yet.
+ # TODO larryliu: evaluate if this code is still needed. If yes let it handle ETKernelIndex.
+
+ dispatch_key = DispatchKey.CPU
+ backend_index = kernel_index._to_backend_index()
+ static_init_dispatch_registrations = ""
+ ns_grouped_native_functions: Dict[str, List[NativeFunction]] = defaultdict(list)
+ for native_function in native_functions:
+ ns_grouped_native_functions[native_function.namespace].append(native_function)
+
+ for namespace, functions in ns_grouped_native_functions.items():
+ if len(functions) == 0:
+ continue
+ dispatch_registrations_body = "\n".join(
+ list(
+ concatMap(
+ dest.RegisterDispatchKey(
+ backend_index,
+ Target.REGISTRATION,
+ selector,
+ rocm=rocm,
+ symint=False,
+ class_method_name=None,
+ skip_dispatcher_op_registration=False,
+ ),
+ functions,
+ )
+ )
+ )
+ static_init_dispatch_registrations += f"""
+TORCH_LIBRARY_IMPL({namespace}, {dispatch_key}, m) {{
+{dispatch_registrations_body}
+}};"""
+ anonymous_definition = "\n".join(
+ list(
+ concatMap(
+ dest.RegisterDispatchKey(
+ backend_index,
+ Target.ANONYMOUS_DEFINITION,
+ selector,
+ rocm=rocm,
+ symint=False,
+ class_method_name=None,
+ skip_dispatcher_op_registration=False,
+ ),
+ native_functions,
+ )
+ )
+ )
+ return anonymous_definition, static_init_dispatch_registrations
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/et_cpp.py b/MLPY/Lib/site-packages/torchgen/executorch/api/et_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b65745e277b1272aa8fdfd4cfc85c17ee9256d2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/api/et_cpp.py
@@ -0,0 +1,368 @@
+from typing import List, Optional, Sequence, Set, Union
+
+from torchgen import local
+from torchgen.api.types import (
+ ArgName,
+ ArrayCType,
+ BaseCType,
+ Binding,
+ ConstRefCType,
+ CType,
+ MutRefCType,
+ NamedCType,
+ SpecialArgName,
+ TupleCType,
+ VectorCType,
+ voidT,
+)
+from torchgen.model import (
+ Argument,
+ Arguments,
+ BaseTy,
+ BaseType,
+ ListType,
+ NativeFunction,
+ OptionalType,
+ Return,
+ SelfArgument,
+ TensorOptionsArguments,
+ Type,
+)
+from torchgen.utils import assert_never
+from .types import (
+ ArrayRefCType,
+ BaseTypeToCppMapping,
+ OptionalCType,
+ scalarT,
+ tensorListT,
+ tensorT,
+)
+
+"""
+This file describes the translation of JIT schema to the public C++ API, which is what people use when they call
+functions like at::add. It also serves as a native function API, which is the signature of kernels,
+since in Executorch CppSignature is the same as NativeSignature.
+
+Difference between this file and torchgen.api.cpp.py:
+
+ - Executorch doesn't support TensorOptions, however in this file we still keep the logic here to be compatible with
+ torchgen.api.cpp, so that we can do stuff like ATen mode (running ATen kernels in Executorch).
+
+ - Executorch doesn't support Dimname.
+
+ - Executorch runtime doesn't support SymInt, will treat it as int.
+"""
+
+
+# Translation of "value types" in JIT schema to C++ API type. Value
+# types look the same no matter if they are argument types or return
+# types. Returns None if the type in question is not a value type.
+def valuetype_type(
+ t: Type,
+ *,
+ binds: ArgName,
+ remove_non_owning_ref_types: bool = False,
+) -> Optional[NamedCType]:
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar:
+ return None
+ # For SymInt we simply treat it as int.
+ elif str(t) == "SymInt":
+ return NamedCType(binds, BaseCType(BaseTypeToCppMapping[BaseTy.int]))
+ if remove_non_owning_ref_types:
+ if t.name == BaseTy.str:
+ raise AssertionError(
+ "string ref->value conversion: not implemented yet"
+ )
+ # All other BaseType currently map directly to BaseCppTypes.
+ return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name]))
+ elif isinstance(t, OptionalType):
+ elem = valuetype_type(t.elem, binds=binds)
+ if elem is None:
+ return None
+ return NamedCType(binds, OptionalCType(elem.type))
+ elif isinstance(t, ListType):
+ if str(t.elem) == "bool":
+ assert t.size is not None
+ return NamedCType(
+ binds, ArrayCType(BaseCType(BaseTypeToCppMapping[BaseTy.bool]), t.size)
+ )
+ else:
+ return None
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+# Translation of types occurring in JIT arguments to a C++ argument type.
+# If remove_non_owning_ref_types is set, we'll guarantee that the output CType is not a non-owning reference type.
+# For example, we'll return std::vector<int> instead of IntArrayRef.
+# See Note [translation from C++ reference to value types]
+def argumenttype_type(
+ t: Type,
+ *,
+ mutable: bool,
+ binds: ArgName,
+ remove_non_owning_ref_types: bool = False,
+) -> NamedCType:
+ # If it's a value type, do the value type translation
+ r = valuetype_type(
+ t,
+ binds=binds,
+ remove_non_owning_ref_types=remove_non_owning_ref_types,
+ )
+ if r is not None:
+ return r
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor:
+ if mutable and not local.use_const_ref_for_mutable_tensors():
+ return NamedCType(binds, MutRefCType(BaseCType(tensorT)))
+ else:
+ return NamedCType(binds, ConstRefCType(BaseCType(tensorT)))
+ elif t.name == BaseTy.Scalar:
+ return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+ else:
+ raise AssertionError(f"base type should have been value type {t}")
+ elif isinstance(t, OptionalType):
+ if str(t.elem) == "Tensor":
+ if mutable and not local.use_const_ref_for_mutable_tensors():
+ return NamedCType(
+ binds, MutRefCType(BaseCType(tensorT))
+ ) # TODO: fix this discrepancy
+ else:
+ return NamedCType(
+ binds, ConstRefCType(OptionalCType(BaseCType(tensorT)))
+ )
+ elif str(t.elem) == "Scalar":
+ return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT))))
+ elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
+ return NamedCType(binds, OptionalCType(elem.type))
+ elif isinstance(t, ListType):
+ # TODO: keeping these special cases for Tensor[] and Tensor?[] so that we can hookup with ATen kernels.
+ if str(t.elem) == "Tensor":
+ return NamedCType(binds, BaseCType(tensorListT))
+ elif str(t.elem) == "Dimname":
+ raise NotImplementedError("Executorch doesn't support Dimname")
+ elif str(t.elem) == "Tensor?":
+ return NamedCType(binds, ArrayRefCType(OptionalCType(BaseCType(tensorT))))
+ elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
+ return NamedCType(binds, ArrayRefCType(elem.type))
+ else:
+ raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+# Translate a JIT argument into its C++ type
+def argument_type(a: Argument, *, binds: ArgName) -> NamedCType:
+ return argumenttype_type(a.type, mutable=a.is_write, binds=binds)
+
+
+# Translation of a (non-multi) return type from JIT to C++
+# N.B: returntype_type returns a CType, not a NamedCType.
+# This is mostly because of the mismatch between return types and return names.
+# e.g. a function with a return type of 'void' has 0 return names,
+# and a function with a return type of 'std::tuple' has >1 return name.
+def returntype_type(t: Type, *, mutable: bool) -> CType:
+ # placeholder is ignored
+ r = valuetype_type(t, binds="__placeholder__")
+ if r is not None:
+ return r.type
+
+ if isinstance(t, BaseType):
+ if t.name == BaseTy.Tensor:
+ if mutable:
+ if local.use_const_ref_for_mutable_tensors():
+ return ConstRefCType(BaseCType(tensorT))
+ else:
+ return MutRefCType(BaseCType(tensorT))
+ else:
+ # Note [Tensor Copy Returns]
+ # Currently, we use "Argument.is_write" to determine
+ # whether or not Tensor return types should be copies or references.
+ # If that ever changes, take a look at other locations of this note!
+ return BaseCType(tensorT)
+ elif t.name == BaseTy.Scalar:
+ return BaseCType(scalarT)
+ elif isinstance(t, ListType):
+ assert (
+ not mutable
+ ), "Native functions should never return a mutable tensor list. They should return void."
+ elem = returntype_type(t.elem, mutable=False)
+ assert t.size is None, f"fixed size list returns not supported: {t}"
+ return VectorCType(elem)
+
+ raise AssertionError(f"unrecognized return type {t}")
+
+
+# Translation of a single return to its C++ type
+def return_type(r: Return) -> CType:
+ return returntype_type(r.type, mutable=r.is_write)
+
+
+# Translation of a full (possibly multi) return from JIT to its C++ type
+def returns_type(rs: Sequence[Return]) -> CType:
+ if len(rs) == 0:
+ return BaseCType(voidT)
+ elif len(rs) == 1:
+ return return_type(rs[0])
+ else:
+ return TupleCType([return_type(r) for r in rs])
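+# Illustrative only: zero returns map to `void`, a single Tensor return to
+# `torch::executor::Tensor`, and two Tensor returns to roughly
+# `::std::tuple<torch::executor::Tensor,torch::executor::Tensor>`.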
+
+
+def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequence[str]:
+ returns: List[str] = []
+ for i, r in enumerate(f.func.returns):
+ # If we have an inplace function, the return argument is
+ # implicitly named self.
+ # TODO: Consider incorporating this into the data model
+ if f.func.name.name.inplace:
+ assert i == 0, "illegal inplace function with multiple returns"
+ name = "self"
+ # If we are out function, the name is the name of the
+ # corresponding output function (r.name will get recorded
+ # in field_name later.)
+ elif f.func.is_out_fn():
+ name = f.func.arguments.out[i].name
+ # If the return argument is explicitly named...
+ elif r.name:
+ name_conflict = any(
+ r.name == a.name for a in f.func.schema_order_arguments()
+ )
+ if name_conflict and not f.func.is_out_fn():
+ name = f"{r.name}_return"
+ else:
+ name = r.name
+ # If there is no explicit name and no fallback name was passed in, we just name the output result,
+ # unless it's a multi-return, in which case it's result0,
+ # result1, etc (zero-indexed)
+ else:
+ name = fallback_name if len(f.func.returns) == 1 else f"{fallback_name}{i}"
+ returns.append(name)
+ return returns
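+# Illustrative only: for an out variant "foo.out(Tensor self, *, Tensor(a!) out)"
+# this returns ["out"]; two unnamed Tensor returns fall back to ["result0", "result1"].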
+
+
+JIT_TO_CPP_DEFAULT = {
+ "False": "false",
+ "True": "true",
+ "None": "torch::executorch::nullopt", # UGH this one is type directed
+ "[]": "{}",
+ "contiguous_format": "torch::executorch::MemoryFormat::Contiguous",
+ "long": "torch::executorch::kLong",
+}
+
+
+# Convert a JIT default into C++ expression representing the default
+def default_expr(d: str, t: Type) -> str:
+ if d == "None" and str(t) == "Tensor?":
+ return "{}"
+ if isinstance(t, BaseType) and t.name is BaseTy.str:
+ # Schema allows single quotes but C++ needs double
+ if len(d) >= 2 and d[0] == "'" and d[-1] == "'":
+ s = ""
+ i = 1
+ while i + 1 < len(d):
+ if d[i] != "\\":
+ if d[i] == '"':
+ s += '\\"'
+ else:
+ s += d[i]
+ i += 1
+ else:
+ if d[i + 1] == "'":
+ s += "'"
+ else:
+ s += d[i : i + 2]
+ i += 2
+
+ return f'"{s}"'
+
+ if isinstance(t, OptionalType):
+ if d == "None":
+ return "torch::executor::nullopt"
+
+ return default_expr(d, t.elem)
+
+ if isinstance(t, ListType):
+ if d.startswith("[") and d.endswith("]"):
+ return "{" + d[1:-1] + "}"
+ elif t.size is None:
+ # NOTE: Sized lists can have scalar defaults
+ raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
+
+ return JIT_TO_CPP_DEFAULT.get(d, d)
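+# Illustrative only:
+# default_expr("True", <bool type>) evaluates to "true",
+# default_expr("[0, 1]", <int[] type>) to "{0, 1}",
+# and default_expr("None", <Tensor? type>) to "{}".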
+
+
+# Convert an argument into its C++ API form
+
+
+def argument(
+ a: Union[Argument, TensorOptionsArguments, SelfArgument],
+ *,
+ cpp_no_default_args: Set[str],
+ method: bool,
+ faithful: bool,
+ has_tensor_options: bool,
+) -> List[Binding]:
+ def sub_argument(
+ a: Union[Argument, TensorOptionsArguments, SelfArgument]
+ ) -> List[Binding]:
+ return argument(
+ a,
+ cpp_no_default_args=cpp_no_default_args,
+ method=method,
+ faithful=faithful,
+ has_tensor_options=has_tensor_options,
+ )
+
+ if isinstance(a, Argument):
+ binds: ArgName
+ if a.name == "memory_format" and has_tensor_options:
+ binds = SpecialArgName.possibly_redundant_memory_format
+ else:
+ binds = a.name
+ default: Optional[str] = None
+ if a.name not in cpp_no_default_args and a.default is not None:
+ default = default_expr(a.default, a.type)
+ return [
+ Binding(
+ nctype=argument_type(a, binds=binds),
+ name=a.name,
+ default=default,
+ argument=a,
+ )
+ ]
+ elif isinstance(a, TensorOptionsArguments):
+ raise NotImplementedError("Need to implement type resolution for TensorOptions")
+ elif isinstance(a, SelfArgument):
+ if method:
+ # Caller is responsible for installing implicit this in context!
+ return []
+ else:
+ return sub_argument(a.argument)
+ else:
+ assert_never(a)
+
+
+def arguments(
+ arguments: Arguments,
+ *,
+ faithful: bool,
+ method: bool,
+ cpp_no_default_args: Set[str],
+) -> List[Binding]:
+ args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+ if faithful:
+ args.extend(arguments.non_out)
+ args.extend(arguments.out)
+ else:
+ args.extend(arguments.out)
+ args.extend(arguments.non_out)
+ return [
+ r.no_default() if faithful else r
+ for a in args
+ for r in argument(
+ a,
+ faithful=faithful,
+ method=method,
+ has_tensor_options=arguments.tensor_options is not None,
+ cpp_no_default_args=cpp_no_default_args,
+ )
+ ]
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/types/__init__.py b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..883459aedfda7ec339de17e1f83da5d6f955f297
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__init__.py
@@ -0,0 +1,2 @@
+from .types import *
+from .signatures import * # isort:skip
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..000dcac3485d852136b63454ca209c3573ed40f0
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/signatures.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/signatures.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8b2f4c7c5bf77a4285ef36ac2bf6bc5c6e4417a
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/signatures.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/types.cpython-39.pyc b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/types.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f32cb3d3d3acd94415a3d47dc8a8133d0d172f3c
Binary files /dev/null and b/MLPY/Lib/site-packages/torchgen/executorch/api/types/__pycache__/types.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/types/signatures.py b/MLPY/Lib/site-packages/torchgen/executorch/api/types/signatures.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c9c4dd95f5d1a85932e71691dce2e12b87077c3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/api/types/signatures.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass
+from typing import List, Optional, Set
+
+import torchgen.api.cpp as aten_cpp
+
+from torchgen.api.types import Binding, CType
+from torchgen.model import FunctionSchema, NativeFunction
+
+from .types import contextArg
+
+
+@dataclass(frozen=True)
+class ExecutorchCppSignature:
+ """
+ This signature is merely a CppSignature with Executorch types (optionally
+ contains KernelRuntimeContext as well). The inline definition of
+ CppSignature is generated in Functions.h and it's used by unboxing
+ functions.
+ """
+
+ # The schema this signature is derived from
+ func: FunctionSchema
+
+ # The set of C++ arguments which should not have defaults applied to them
+ cpp_no_default_args: Set[str]
+
+ # Allows you to prepend an arbitrary prefix to the signature name.
+ # This is useful for parts of the codegen that generate wrappers around kernels,
+ # and need to avoid naming collisions.
+ prefix: str = ""
+
+ def arguments(self, *, include_context: bool = True) -> List[Binding]:
+ return ([contextArg] if include_context else []) + et_cpp.arguments(
+ self.func.arguments,
+ faithful=True, # always faithful, out argument at the end
+ method=False, # method not supported
+ cpp_no_default_args=self.cpp_no_default_args,
+ )
+
+ def name(self) -> str:
+ return self.prefix + aten_cpp.name(
+ self.func,
+ faithful_name_for_out_overloads=True,
+ )
+
+ def decl(self, name: Optional[str] = None, *, include_context: bool = True) -> str:
+ args_str = ", ".join(
+ a.decl() for a in self.arguments(include_context=include_context)
+ )
+ if name is None:
+ name = self.name()
+ return f"{self.returns_type().cpp_type()} {name}({args_str})"
+
+ def defn(self, name: Optional[str] = None) -> str:
+ args = [a.defn() for a in self.arguments()]
+ args_str = ", ".join(args)
+ if name is None:
+ name = self.name()
+ return f"{self.returns_type().cpp_type()} {name}({args_str})"
+
+ def returns_type(self) -> CType:
+ return et_cpp.returns_type(self.func.returns)
+
+ @staticmethod
+ def from_native_function(
+ f: NativeFunction, *, prefix: str = ""
+ ) -> "ExecutorchCppSignature":
+ return ExecutorchCppSignature(
+ func=f.func, prefix=prefix, cpp_no_default_args=f.cpp_no_default_args
+ )
+
+
+from torchgen.executorch.api import et_cpp
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/types/types.py b/MLPY/Lib/site-packages/torchgen/executorch/api/types/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..28b2f03b4b3e1fdc74511920ef88d0f54981dbe1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/api/types/types.py
@@ -0,0 +1,81 @@
+from dataclasses import dataclass
+from typing import Dict
+
+from torchgen.api.types import (
+ BaseCppType,
+ BaseCType,
+ Binding,
+ boolT,
+ CType,
+ doubleT,
+ Expr,
+ longT,
+ MutRefCType,
+ NamedCType,
+)
+from torchgen.model import BaseTy
+
+halfT = BaseCppType("torch::executor", "Half")
+bfloat16T = BaseCppType("torch::executor", "BFloat16")
+stringT = BaseCppType("torch::executor", "string_view")
+scalarTypeT = BaseCppType("torch::executor", "ScalarType")
+tensorT = BaseCppType("torch::executor", "Tensor")
+tensorListT = BaseCppType("torch::executor", "TensorList")
+scalarT = BaseCppType("torch::executor", "Scalar")
+memoryFormatT = BaseCppType("torch::executor", "MemoryFormat")
+intArrayRefT = BaseCppType("torch::executor", "IntArrayRef")
+optionalT = BaseCppType("torch::executor", "optional")
+contextT = BaseCppType("torch::executor", "KernelRuntimeContext")
+
+contextExpr = Expr(
+ expr="context",
+ type=NamedCType(name="context", type=MutRefCType(BaseCType(contextT))),
+)
+
+contextArg = Binding(
+ name="context",
+ nctype=contextExpr.type,
+ argument=None, # type: ignore[arg-type]
+ default=None,
+)
+
+BaseTypeToCppMapping: Dict[BaseTy, BaseCppType] = {
+ BaseTy.int: longT,
+ BaseTy.float: doubleT,
+ BaseTy.bool: boolT,
+ BaseTy.str: stringT,
+ BaseTy.ScalarType: scalarTypeT,
+ BaseTy.Tensor: tensorT,
+ BaseTy.Scalar: scalarT,
+ BaseTy.MemoryFormat: memoryFormatT,
+}
+
+
+@dataclass(frozen=True)
+class OptionalCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"torch::executor::optional<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"torch::executor::optional<{self.elem.cpp_type_registration_declarations()}>"
+
+ def remove_const_ref(self) -> "CType":
+ return OptionalCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class ArrayRefCType(CType):
+ elem: "CType"
+
+ def cpp_type(self, *, strip_ref: bool = False) -> str:
+ # Do not pass `strip_ref` recursively.
+ return f"torch::executor::ArrayRef<{self.elem.cpp_type()}>"
+
+ def cpp_type_registration_declarations(self) -> str:
+ return f"torch::executor::ArrayRef<{self.elem.cpp_type_registration_declarations()}>"
+
+ def remove_const_ref(self) -> "CType":
+ return ArrayRefCType(self.elem.remove_const_ref())
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/api/unboxing.py b/MLPY/Lib/site-packages/torchgen/executorch/api/unboxing.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df3a929c1fdeab35f50ec9ab661c1ff9ad6c3af
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/api/unboxing.py
@@ -0,0 +1,213 @@
+from dataclasses import dataclass
+from typing import Callable, List, Sequence, Tuple
+
+from torchgen.api.types import Binding, CType, NamedCType
+from torchgen.model import (
+ Argument,
+ BaseTy,
+ BaseType,
+ ListType,
+ NativeFunction,
+ OptionalType,
+ Type,
+)
+
+connector = "\n\t"
+
+
+# Return unboxing function name for a NativeFunction
+def name(f: NativeFunction) -> str:
+ return f.func.name.unambiguous_name()
+
+
+@dataclass(frozen=True)
+class Unboxing:
+ """
+ Takes a sequence of Bindings and unboxes EValues into these Bindings. Returns generated code that performs the correct unboxing.
+ A sample generated code:
+ // aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+ void mul_out(EValue** stack) {
+ EValue& self = *stack[0];
+ EValue& other = *stack[1];
+ EValue& out = *stack[2];
+ const torch::executor::Tensor & self_base = self.to<torch::executor::Tensor>();
+ const torch::executor::Tensor & other_base = other.to<torch::executor::Tensor>();
+ torch::executor::Tensor & out_base = out.to<torch::executor::Tensor>();
+
+ EXECUTORCH_SCOPE_PROF("native_call_mul.out");
+ torch::executor::mul_outf(self_base, other_base, out_base);
+
+
+ }
+ """
+
+ # this is a callable that converts a JIT argument, into its C++ type.
+ # Translates (type, mutability, binds) to NamedCType. E.g., torchgen.api.cpp.argumenttype_type.
+ argument_type_gen: Callable[
+ ...,
+ NamedCType,
+ ]
+
+ # Convert all the arguments in a NativeFunction to C++ code
+ def convert_arguments(
+ self, args: Sequence[Binding]
+ ) -> Tuple[List[Binding], List[str]]:
+ code_list = [f"EValue& {args[i].name} = *stack[{i}];" for i in range(len(args))]
+ binding_list = []
+ for arg in args:
+ # expecting only Argument
+ if not isinstance(arg.argument, Argument):
+ raise Exception(
+ f"Unexpected argument type, expecting `Argument` but got {arg}"
+ )
+ argument: Argument = arg.argument
+ unboxed_name, _, code, decl = self.argumenttype_evalue_convert(
+ argument.type, argument.name, mutable=argument.is_write
+ )
+ code_list.extend(decl)
+ code_list.extend(code)
+ binding_list.append(arg.with_name(unboxed_name))
+ return binding_list, code_list
+
+ def argumenttype_evalue_convert(
+ self, t: Type, arg_name: str, *, mutable: bool = False
+ ) -> Tuple[str, CType, List[str], List[str]]:
+ """
+ Takes in the type, name and mutability corresponding to an argument, and generates a tuple of:
+ (1) the C++ code necessary to unbox the argument
+ (2) A Binding corresponding to the newly created unboxed variable, including variable name and its CType
+ :param t: a `Type` of an argument
+ :param arg_name: argument name
+ :param mutable: boolean for whether this argument type is mutable
+ :return: unboxed result
+ """
+ ctype = self.argument_type_gen(t, mutable=mutable, binds=arg_name).type
+
+ if isinstance(t, BaseType):
+ out_name = f"{arg_name}_base"
+ code, decl = self._gen_code_base_type(
+ arg_name=arg_name, out_name=out_name, ctype=ctype
+ )
+ elif isinstance(t, OptionalType):
+ out_name = f"{arg_name}_opt_out"
+ code, decl = self._gen_code_optional_type(
+ arg_name=arg_name, out_name=out_name, t=t, ctype=ctype
+ )
+ elif isinstance(t, ListType):
+ out_name = f"{arg_name}_list_out"
+ code, decl = self._gen_code_list_type(
+ arg_name=arg_name, out_name=out_name, t=t, ctype=ctype
+ )
+ else:
+ raise Exception(f"Cannot handle type {t}. arg_name: {arg_name}")
+ return out_name, ctype, code, decl
+
+ def _gen_code_base_type(
+ self, arg_name: str, out_name: str, ctype: CType
+ ) -> Tuple[List[str], List[str]]:
+ return [
+ f"{ctype.cpp_type()} {out_name} = {arg_name}.to<{ctype.cpp_type(strip_ref=True)}>();"
+ ], []
+
+ def _gen_code_optional_type(
+ self, arg_name: str, out_name: str, t: OptionalType, ctype: CType
+ ) -> Tuple[List[str], List[str]]:
+ in_name = f"{arg_name}_opt_in"
+ res_name, base_type, res_code, decl = self.argumenttype_evalue_convert(
+ t.elem, in_name
+ )
+ return (
+ f"""
+ {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toOptional<{base_type.cpp_type(strip_ref=True)}>();
+ """.split(
+ "\n"
+ ),
+ decl,
+ )
+
+ def _gen_code_list_type(
+ self, arg_name: str, out_name: str, t: ListType, ctype: CType
+ ) -> Tuple[List[str], List[str]]:
+ in_name = f"{arg_name}_list_in"
+ elem_name = f"{arg_name}_elem"
+ code = []
+ res_name, res_ctype, res_code, decl = self.argumenttype_evalue_convert(
+ t.elem, elem_name
+ )
+
+ if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.Tensor:
+ code.extend(
+ f"""
+ {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toTensorList();
+ """.split(
+ "\n"
+ )
+ )
+ elif isinstance(t.elem, BaseType) and (
+ t.elem.name == BaseTy.int or t.elem.name == BaseTy.SymInt
+ ):
+ code.extend(
+ f"""
+ {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toIntList();
+ """.split(
+ "\n"
+ )
+ )
+ elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.float:
+ code.extend(
+ f"""
+ {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toDoubleList();
+ """.split(
+ "\n"
+ )
+ )
+ elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.bool:
+ # handle list type with size, e.g., bool[4]
+ code.extend(
+ f"""
+ {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toBoolList();
+ """.split(
+ "\n"
+ )
+ )
+ # pytorch codegen:
+ # we have to use c10::List for optional element. e.g., Tensor?[] -> c10::List<c10::optional<at::Tensor>>
+ elif (
+ isinstance(t.elem, OptionalType)
+ and isinstance(t.elem.elem, BaseType)
+ and t.elem.elem.name == BaseTy.Tensor
+ ):
+ code.extend(
+ f"""
+#ifdef USE_ATEN_LIB
+at::ArrayRef<c10::optional<at::Tensor>> {in_name} = {arg_name}.toListOptionalTensor();
+c10::List<c10::optional<at::Tensor>> {out_name};
+for (auto {elem_name}: {in_name}) {{
+ {out_name}.push_back({elem_name});
+}}
+#else
+torch::executor::ArrayRef<torch::executor::optional<torch::executor::Tensor>> {out_name} = {arg_name}.toListOptionalTensor();
+#endif
+ """.split(
+ "\n"
+ )
+ )
+ else:
+ # use ArrayRef as default.
+ vec_name = arg_name + "_vec"
+ # need to bring vector instantiation out of scope so that ArrayRef has valid data
+ decl.append(
+ f"std::vector<{res_ctype.cpp_type(strip_ref=True)}> {vec_name};"
+ )
+ code.extend(
+ f"""
+ for (EValue {elem_name}: {in_name}) {{
+ {connector.join(res_code)}
+ {vec_name}.push_back({res_name});
+ }}
+ {ctype.cpp_type(strip_ref=True)} {out_name}({vec_name});
+ """.split(
+ "\n"
+ )
+ )
+ return code, decl
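The conversion above is easiest to see with a small sketch. The toy function below is not the real torchgen API; it only mimics the shape of the unboxing output that argumenttype_evalue_convert produces for a base-type argument versus a list argument, with schematic C++ on the right-hand side (a hoisted vector declaration plus a per-element loop in the list case).

    from typing import List, Tuple

    def toy_evalue_convert(arg_name: str, cpp_type: str, is_list: bool) -> Tuple[str, List[str], List[str]]:
        """Toy stand-in returning (out_name, code, decl); the real converter also returns a CType."""
        if not is_list:
            out_name = f"{arg_name}_base"
            return out_name, [f"{cpp_type} {out_name} = {arg_name}.to<{cpp_type}>();"], []
        # List case: the backing vector is declared out of scope (decl) so the
        # ArrayRef constructed from it stays valid after the loop.
        out_name, vec_name = f"{arg_name}_list_out", f"{arg_name}_vec"
        decl = [f"std::vector<{cpp_type}> {vec_name};"]
        code = [
            f"for (EValue {arg_name}_elem: {arg_name}_list_in) {{",
            f"  {vec_name}.push_back({arg_name}_elem.to<{cpp_type}>());",
            "}",
            f"ArrayRef<{cpp_type}> {out_name}({vec_name});",
        ]
        return out_name, code, decl

    name, code, decl = toy_evalue_convert("dim", "int64_t", is_list=False)
    # name == "dim_base"; code == ["int64_t dim_base = dim.to<int64_t>();"]; decl == []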
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/model.py b/MLPY/Lib/site-packages/torchgen/executorch/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..48384e687403f4b04dbaa8a4ecfc97c4934606d0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/model.py
@@ -0,0 +1,220 @@
+# Represents all kernels used by an Executorch model.
+# It maintains a Dict[OperatorName, Dict[ETKernelKey, BackendMetadata]] structure.
+
+import itertools
+from collections import defaultdict, namedtuple
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Dict, List, Tuple, Union
+
+from torchgen.model import (
+ BackendIndex,
+ BackendMetadata,
+ DispatchKey,
+ NativeFunction,
+ NativeFunctionsGroup,
+ OperatorName,
+)
+from torchgen.utils import assert_never
+
+KERNEL_KEY_VERSION = 1
+
+
+# TODO: Duplicated Subset from codegen.tool.gen_oplist, remove declaration in codegen
+class ScalarType(IntEnum):
+ Byte = 0
+ Char = 1
+ Short = 2
+ Int = 3
+ Long = 4
+ Float = 6
+ Double = 7
+ Bool = 11
+
+
+ETParsedYaml = namedtuple("ETParsedYaml", ["native_functions", "kernel_index"])
+
+
+@dataclass(frozen=True)
+class ETKernelKeyOpArgMeta:
+ arg_name: str
+ dtype: str
+ # The order of the dimensions if entry is a Tensor
+ dim_order: Tuple[int, ...]
+
+ def to_native_string(self) -> str:
+ dtype_str = ScalarType[self.dtype].value
+ dim_str = str(self.dim_order)[1:-1].replace(" ", "")
+ return f"{dtype_str};{dim_str}"
+
+
+@dataclass(frozen=True)
+class ETKernelKey:
+ # Field undefined is default = True
+ arg_meta: Tuple[ETKernelKeyOpArgMeta, ...] = ()
+
+ # Indicator for this kernel being used as a catch all
+ default: bool = False
+
+ version: int = KERNEL_KEY_VERSION
+
+ @staticmethod
+ def gen_from_yaml(
+ args: Dict[str, Tuple[str, str]],
+ type_alias_map: Dict[str, List[str]], # TODO: Support unwrapped str val
+ dim_order_alias_map: Dict[str, List[int]],
+ ) -> List["ETKernelKey"]:
+ """Generate ETKernelKeys from arg kernel specs
+ Multiple ETKernelKeys may be returned because every dtype permutation allowed by
+ type_alias_map is materialized as its own kernel key.
+
+ Args:
+ args: Mapping from argument name to kernel specs
+ Kernel specs are a tuple of (dtype, dim_order).
+ Currently tuple entries must be aliased via the alias map arguments
+ type_alias_map: Mapping from type alias to potential type enums
+ i.e { T0 : [Double, Int] } means T0 can be either Double or Int
+ Used for lookup by args
+ dim_order_alias_map: Mapping from alias to a list of dimension orders
+ Used for lookup by args
+ """
+ # Cast dim order alias values to int
+ dim_order_alias_map = {
+ k: [int(alias) for alias in v] for k, v in dim_order_alias_map.items()
+ }
+ kernel_keys = []
+
+ # Get all used Dtype Alias
+ dtype_alias_used = set()
+ for type_alias, dim_order in args.values():
+ # Enforce usage of alias initially
+ # TODO: Support inlined arguments
+ assert type_alias in type_alias_map, "Undefined type alias: " + str(
+ type_alias
+ )
+ assert (
+ dim_order in dim_order_alias_map
+ ), "Undefined dim_order alias: " + str(dim_order)
+ dtype_alias_used.add(type_alias)
+
+ # Generate all permutations of dtype alias values
+ alias_dtypes = [
+ [(alias, dtype) for dtype in type_alias_map[alias]]
+ for alias in dtype_alias_used
+ ]
+ alias_permutations = [
+ dict(permutation) for permutation in list(itertools.product(*alias_dtypes))
+ ]
+
+ # Using each alias value permutation, generate kernel keys
+ op_arg_cache = {}
+ for permutation in alias_permutations:
+ arg_list = []
+ for arg_name, arg_spec in args.items():
+ dtype = permutation[arg_spec[0]]
+ dim_order = dim_order_alias_map[arg_spec[1]] # type: ignore[assignment]
+ if (
+ cache_key := (arg_name, dtype, tuple(dim_order))
+ ) not in op_arg_cache:
+ op_arg_cache[cache_key] = ETKernelKeyOpArgMeta(*cache_key) # type: ignore[arg-type]
+
+ arg_list.append(op_arg_cache[cache_key])
+ kernel_keys.append(ETKernelKey(tuple(arg_list)))
+
+ return kernel_keys
+
+ def to_native_string(self) -> str:
+ if self.default:
+ return "default"
+ return (
+ "v"
+ + str(KERNEL_KEY_VERSION)
+ + "/"
+ + "|".join([arg.to_native_string() for arg in self.arg_meta])
+ )
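To make the alias expansion in gen_from_yaml concrete, here is a standalone sketch of its permutation step (the argument and alias names are invented): with T0 bound to two dtypes and T1 to one, exactly two dtype permutations, and hence two kernel keys, are produced.

    import itertools

    # Hypothetical kernel spec: argument name -> (type alias, dim-order alias)
    args = {"self": ("T0", "D0"), "other": ("T1", "D0")}
    type_alias_map = {"T0": ["Double", "Int"], "T1": ["Float"]}

    used_aliases = sorted({type_alias for type_alias, _ in args.values()})
    permutations = [
        dict(p)
        for p in itertools.product(
            *[[(alias, dtype) for dtype in type_alias_map[alias]] for alias in used_aliases]
        )
    ]
    assert permutations == [
        {"T0": "Double", "T1": "Float"},
        {"T0": "Int", "T1": "Float"},
    ]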
+
+
+@dataclass(frozen=True)
+class ETKernelIndex:
+ index: Dict[OperatorName, Dict[ETKernelKey, BackendMetadata]]
+
+ def has_kernels(self, g: Union[NativeFunction, NativeFunctionsGroup]) -> bool:
+ m = self.get_kernels(g)
+ return m is not None
+
+ def get_kernels(
+ self, g: Union[NativeFunction, NativeFunctionsGroup]
+ ) -> Dict[ETKernelKey, BackendMetadata]:
+ if isinstance(g, NativeFunction):
+ f = g
+ elif isinstance(g, NativeFunctionsGroup):
+ f = g.functional
+ else:
+ assert_never(g)
+ if f.func.name not in self.index:
+ return {}
+ return self.index[f.func.name]
+
+ @staticmethod
+ def grow_from_backend_indices(
+ kernel_index: Dict[OperatorName, Dict[ETKernelKey, BackendMetadata]],
+ backend_indices: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]],
+ ) -> None:
+ for dk in backend_indices:
+ index = backend_indices[dk]
+ for op, backend_metadata in index.items():
+ if op in kernel_index:
+ kernel_index[op][ETKernelKey(default=True)] = backend_metadata
+ else:
+ kernel_index[op] = {ETKernelKey(default=True): backend_metadata}
+
+ @staticmethod
+ def from_backend_indices(
+ backend_indices: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]]
+ ) -> "ETKernelIndex":
+ kernel_index: Dict[
+ OperatorName, Dict[ETKernelKey, BackendMetadata]
+ ] = defaultdict(dict)
+ ETKernelIndex.grow_from_backend_indices(kernel_index, backend_indices)
+ return ETKernelIndex(kernel_index)
+
+ def grow(
+ self, backend_indices: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]]
+ ) -> "ETKernelIndex":
+ ETKernelIndex.grow_from_backend_indices(self.index, backend_indices)
+ return self
+
+ def _to_backend_index(self) -> BackendIndex:
+ """
+ WARNING: this will be deprecated once all the codegen places know how to handle ETKernelIndex.
+ """
+ index: Dict[OperatorName, BackendMetadata] = {}
+ for op in self.index:
+ kernel_dict = self.index[op]
+ assert (
+ len(kernel_dict.values()) == 1
+ ), f"Can't convert ETKernelIndex to BackendIndex because {op} has more than one kernels. Got {kernel_dict}"
+ index[op] = kernel_dict.get(
+ ETKernelKey(default=True),
+ BackendMetadata(kernel="", structured=False, cpp_namespace=""),
+ )
+ return BackendIndex(
+ dispatch_key=DispatchKey.CPU,
+ use_out_as_primary=False,
+ device_guard=False,
+ external=False,
+ index=index,
+ )
+
+ # Note duplicate ETKernelKey from index_b will clobber the metadata from index_a
+ @staticmethod
+ def merge_indices(
+ index_a: "ETKernelIndex", index_b: "ETKernelIndex"
+ ) -> "ETKernelIndex":
+ combined = defaultdict(dict, index_a.index.copy())
+
+ for op, entry in index_b.index.items():
+ for key, metadata in entry.items():
+ combined[op][key] = metadata
+
+ return ETKernelIndex(combined)
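A quick worked example of the serialized key format defined above; the values follow directly from the ScalarType enum and KERNEL_KEY_VERSION in this file (the argument name is arbitrary).

    from torchgen.executorch.model import ETKernelKey, ETKernelKeyOpArgMeta

    meta = ETKernelKeyOpArgMeta(arg_name="self", dtype="Float", dim_order=(0, 1))
    assert meta.to_native_string() == "6;0,1"                        # ScalarType.Float == 6
    assert ETKernelKey(arg_meta=(meta,)).to_native_string() == "v1/6;0,1"
    assert ETKernelKey(default=True).to_native_string() == "default"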
diff --git a/MLPY/Lib/site-packages/torchgen/executorch/parse.py b/MLPY/Lib/site-packages/torchgen/executorch/parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..275fb339a84972613bbac0988fc2d9116552c675
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/executorch/parse.py
@@ -0,0 +1,151 @@
+from collections import defaultdict, namedtuple
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import yaml
+
+from torchgen.executorch.model import ETKernelIndex, ETKernelKey
+
+from torchgen.gen import LineLoader, parse_native_yaml
+from torchgen.model import (
+ BackendMetadata,
+ DispatchKey,
+ FunctionSchema,
+ NativeFunction,
+ OperatorName,
+)
+from torchgen.utils import NamespaceHelper
+
+# Parse native_functions.yaml into a sequence of NativeFunctions and ET Backend Indices.
+ETParsedYaml = namedtuple("ETParsedYaml", ["native_functions", "et_kernel_indices"])
+
+# Fields in native_functions.yaml used to determine which kernels should be used
+ET_FIELDS = ["kernels", "type_alias", "dim_order_alias"]
+
+
+def parse_from_yaml(ei: Dict[str, object]) -> Dict[ETKernelKey, BackendMetadata]:
+ """Given a loaded yaml representing kernel assignment information, extract the
+ mapping from `kernel keys` to `BackendMetadata` (the latter representing the kernel instance)
+
+ Args:
+ ei: Dict keys {kernels, type_alias, dim_order_alias}
+ See ETKernelKey for description of arguments
+ """
+ e = ei.copy()
+ if (kernels := e.pop("kernels", None)) is None:
+ return {}
+
+ type_alias: Dict[str, List[str]] = e.pop("type_alias", {}) # type: ignore[assignment]
+ dim_order_alias: Dict[str, List[str]] = e.pop("dim_order_alias", {}) # type: ignore[assignment]
+ dim_order_alias.pop("__line__", None)
+
+ kernel_mapping: Dict[ETKernelKey, BackendMetadata] = {}
+
+ for entry in kernels: # type: ignore[attr-defined]
+ arg_meta = entry.get("arg_meta")
+ if arg_meta is not None:
+ arg_meta.pop("__line__")
+
+ kernel_name = entry.get("kernel_name")
+ namespace_helper = NamespaceHelper.from_namespaced_entity(
+ kernel_name, max_level=3
+ )
+ kernel_namespace = namespace_helper.get_cpp_namespace(default="at")
+ backend_metadata = BackendMetadata(
+ kernel=namespace_helper.entity_name,
+ structured=False,
+ cpp_namespace=(kernel_namespace + "::native"),
+ )
+
+ kernel_keys = (
+ [ETKernelKey((), default=True)]
+ if arg_meta is None
+ else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) # type: ignore[arg-type]
+ )
+
+ for kernel_key in kernel_keys:
+ assert kernel_key not in kernel_mapping, (
+ "Duplicate kernel key: " + str(kernel_key) + " " + str(e)
+ )
+ kernel_mapping[kernel_key] = backend_metadata
+
+ return kernel_mapping
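Roughly, a loaded yaml entry consumed by parse_from_yaml has the shape sketched below. The operator, kernel name, and alias values are invented for illustration, and a real entry loaded through LineLoader would additionally carry __line__ keys that the parser strips.

    entry = {
        "kernels": [
            {
                "arg_meta": {"self": ("T0", "D0"), "out": ("T0", "D0")},
                "kernel_name": "custom::add_out",
            },
            # A kernel without arg_meta becomes the catch-all ETKernelKey(default=True).
        ],
        "type_alias": {"T0": ["Float", "Double"]},
        "dim_order_alias": {"D0": ["0", "1"]},
    }
    # For "custom::add_out" the resulting BackendMetadata has kernel="add_out" and
    # cpp_namespace="custom::native"; each dtype permutation of T0 gets its own key.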
+
+
+def parse_et_yaml_struct(es: object) -> ETKernelIndex:
+ """Given a loaded yaml representing a list of operators, for each op extract the mapping
+ of `kernel keys` to `BackendMetadata` (the latter representing the kernel instance
+ that should be used by the kernel key).
+ """
+ indices: Dict[OperatorName, Dict[ETKernelKey, BackendMetadata]] = {}
+ for ei in es: # type: ignore[attr-defined]
+ e = ei.copy()
+
+ funcs = e.pop("func")
+ assert isinstance(funcs, str), f"not a str: {funcs}"
+ namespace_helper = NamespaceHelper.from_namespaced_entity(
+ namespaced_entity=funcs, max_level=1
+ )
+ opname = FunctionSchema.parse(namespace_helper.entity_name).name
+
+ assert opname not in indices, f"Duplicate func found in yaml: {opname} already defined"
+
+ if len(index := parse_from_yaml(e)) != 0:
+ indices[opname] = index
+
+ return ETKernelIndex(indices)
+
+
+def extract_kernel_fields(es: object) -> Dict[OperatorName, Dict[str, Any]]:
+ """Given a loaded yaml representing a list of operators, extract the
+ kernel key related fields indexed by the operator name.
+ """
+ fields: Dict[OperatorName, Dict[str, Any]] = defaultdict(dict)
+ for ei in es: # type: ignore[attr-defined]
+ funcs = ei.get("func")
+ assert isinstance(funcs, str), f"not a str: {funcs}"
+ namespace_helper = NamespaceHelper.from_namespaced_entity(
+ namespaced_entity=funcs, max_level=1
+ )
+ opname = FunctionSchema.parse(namespace_helper.entity_name).name
+
+ for field in ET_FIELDS:
+ if (value := ei.get(field)) is not None:
+ fields[opname][field] = value
+
+ return fields
+
+
+def parse_et_yaml(
+ path: str,
+ tags_yaml_path: str,
+ ignore_keys: Optional[Set[DispatchKey]] = None,
+ skip_native_fns_gen: bool = False,
+) -> Tuple[List[NativeFunction], Dict[OperatorName, Dict[str, Any]]]:
+ """Parse native_functions.yaml into NativeFunctions and an Operator Indexed Dict
+ of fields to persist from native_functions.yaml to functions.yaml
+ """
+ with open(path) as f:
+ es = yaml.load(f, Loader=LineLoader)
+
+ et_kernel = extract_kernel_fields(es)
+
+ # Remove ET specific fields from entries for BC compatibility
+ strip_et_fields(es)
+
+ native_yaml = parse_native_yaml(
+ path,
+ tags_yaml_path,
+ ignore_keys,
+ skip_native_fns_gen=skip_native_fns_gen,
+ loaded_yaml=es,
+ )
+ return native_yaml.native_functions, et_kernel
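A hedged usage sketch of this entry point (both paths are placeholders):

    native_functions, et_fields = parse_et_yaml(
        "native_functions.yaml",   # placeholder path
        "tags.yaml",               # placeholder path
        ignore_keys=None,
        skip_native_fns_gen=True,
    )
    # et_fields maps OperatorName -> {"kernels": ..., "type_alias": ..., "dim_order_alias": ...}
    # for the operators that declared any of those fields.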
+
+
+def strip_et_fields(es: object) -> None:
+ """Given a loaded yaml representing a list of operators,
+ remove ET-specific fields from every entry for BC compatibility
+ """
+ for entry in es: # type: ignore[attr-defined]
+ for field in ET_FIELDS:
+ entry.pop(field, None)
diff --git a/MLPY/Lib/site-packages/torchgen/gen.py b/MLPY/Lib/site-packages/torchgen/gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e39839c020d04948fe9758bf3200c6230766831
--- /dev/null
+++ b/MLPY/Lib/site-packages/torchgen/gen.py
@@ -0,0 +1,2937 @@
+import argparse
+import functools
+import json
+import os
+import pathlib
+from collections import defaultdict, namedtuple, OrderedDict
+from dataclasses import dataclass, field
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Sequence,
+ Set,
+ Tuple,
+ TypeVar,
+ Union,
+)
+
+import yaml
+
+import torchgen.api.dispatcher as dispatcher
+import torchgen.api.meta as meta
+import torchgen.api.native as native
+import torchgen.api.structured as structured
+import torchgen.dest as dest
+
+from torchgen.api import cpp
+from torchgen.api.translate import translate
+from torchgen.api.types import (
+ Binding,
+ CppSignature,
+ CppSignatureGroup,
+ DispatcherSignature,
+ NamedCType,
+ NativeSignature,
+ SpecialArgName,
+)
+from torchgen.context import (
+ method_with_native_function,
+ native_function_manager,
+ with_native_function,
+ with_native_function_and_indices,
+)
+from torchgen.gen_aoti_c_shim import (
+ gen_aoti_c_shim,
+ gen_static_dispatch_backend_call_signature,
+ get_backend_index_for_aoti,
+)
+from torchgen.gen_functionalization_type import (
+ gen_functionalization_definition,
+ gen_functionalization_registration,
+ gen_functionalization_view_inverse_declaration,
+ GenCompositeViewCopyKernel,
+)
+from torchgen.gen_vmap_plumbing import gen_all_vmap_plumbing
+
+from torchgen.model import (
+ Argument,
+ BackendIndex,
+ BackendMetadata,
+ BaseOperatorName,
+ DEFAULT_KERNEL_NAMESPACE,
+ DispatchKey,
+ FRAGMENT_NAMESPACES,
+ FunctionSchema,
+ is_cuda_dispatch_key,
+ is_generic_dispatch_key,
+ is_ufunc_dispatch_key,
+ Location,
+ NativeFunction,
+ NativeFunctionsGroup,
+ NativeFunctionsViewGroup,
+ OperatorName,
+ OptionalType,
+ SchemaKind,
+ SelfArgument,
+ STRUCTURED_DISPATCH_KEYS,
+ TensorOptionsArguments,
+ Type,
+ Variant,
+ ViewSchemaKind,
+)
+from torchgen.native_function_generation import (
+ add_generated_native_functions,
+ gen_composite_functional_kernel,
+ gen_composite_out_kernel,
+ pre_group_native_functions,
+)
+from torchgen.selective_build.selector import SelectiveBuilder
+from torchgen.utils import (
+ assert_never,
+ concatMap,
+ context,
+ FileManager,
+ make_file_manager,
+ mapMaybe,
+ NamespaceHelper,
+ Target,
+)
+from torchgen.yaml_utils import YamlDumper, YamlLoader
+
+T = TypeVar("T")
+
+# Welcome to the ATen code generator v2! The ATen code generator is
+# responsible for parsing native_functions.yaml and then generating
+# various generated files (e.g., TypeDefault.cpp) based on the operators
+# defined in this file. This means that the code generator knows how to
+# parse function schema, and then translate this into various C++ types
+# and boilerplate code.
+#
+# Some things to know about this file when you modify it:
+#
+# - This file has STRICT mypy typechecking. Typecheck it with
+# `mypy --config mypy-strict.ini` in the root source directory
+#
+# - Most of the heavy lifting lives in external modules:
+# - 'model' has the data model for native_functions.yaml. The classes
+# in that file represent what you see when you look at
+# a native_functions.yaml
+# - 'api' has conversions for how to translate JIT schema into
+# the various C++ APIs that the codegen interacts with. There
+# are in fact THREE different C++ APIs: the public C++ API,
+# the dispatcher API, and the legacy dispatcher API. See each
+# of these respective files for more information
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# HELPER FUNCTIONS
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+# A custom loader for YAML to let us also keep track of line numbers
+# of each entry in the YAML file
+class LineLoader(YamlLoader):
+ def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def]
+ mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call]
+ # Add 1 so line numbering starts at 1
+ mapping["__line__"] = node.start_mark.line + 1
+ return mapping
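A minimal illustration of the line tracking (the yaml text is invented): every mapping constructed by LineLoader gains a __line__ key holding its 1-based starting line.

    import yaml

    text = "- func: add.Tensor(Tensor self, Tensor other) -> Tensor\n  variants: function\n"
    es = yaml.load(text, Loader=LineLoader)
    assert es[0]["__line__"] == 1
    assert es[0]["func"] == "add.Tensor(Tensor self, Tensor other) -> Tensor"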
+
+
+# Parse native_functions.yaml into a sequence of NativeFunctions and Backend Indices.
+ParsedYaml = namedtuple("ParsedYaml", ["native_functions", "backend_indices"])
+
+
+_GLOBAL_PARSE_NATIVE_YAML_CACHE: Dict[str, ParsedYaml] = {}
+_GLOBAL_PARSE_TAGS_YAML_CACHE: Dict[str, Set[str]] = {}
+
+
+def parse_native_yaml_struct(
+ es: object,
+ valid_tags: Set[str],
+ ignore_keys: Optional[Set[DispatchKey]] = None,
+ path: str = "",
+ skip_native_fns_gen: bool = False,
+) -> ParsedYaml:
+ assert isinstance(es, list)
+ rs: List[NativeFunction] = []
+ bs: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]] = defaultdict(dict)
+ for e in es:
+ assert isinstance(e.get("__line__"), int), e
+ loc = Location(path, e["__line__"])
+ funcs = e.get("func")
+ with context(lambda: f"in {loc}:\n {funcs}"):
+ func, m = NativeFunction.from_yaml(e, loc, valid_tags, ignore_keys)
+ rs.append(func)
+ BackendIndex.grow_index(bs, m)
+ error_check_native_functions(rs)
+ # Default dict is to prevent the codegen from barfing when we have a dispatch key that has no kernels yet.
+ indices: Dict[DispatchKey, BackendIndex] = defaultdict(
+ lambda: BackendIndex(
+ dispatch_key=DispatchKey.Undefined,
+ use_out_as_primary=True,
+ external=False,
+ device_guard=False,
+ # I'm actually not sure about this; undefined could be hit on
+ # empty TensorList, hypothetically that could have sizes in it
+ index={},
+ )
+ )
+ if not skip_native_fns_gen:
+ add_generated_native_functions(rs, bs)
+ for k, v in bs.items():
+ # All structured in-tree operators are implemented in terms of their out operator.
+ indices[k] = BackendIndex(
+ dispatch_key=k,
+ use_out_as_primary=True,
+ external=False,
+ # Only cuda-like devices in tree require device guards
+ device_guard=is_cuda_dispatch_key(k),
+ index=v,
+ )
+ return ParsedYaml(rs, indices)
+
+
+def parse_tags_yaml_struct(es: object, path: str = "") -> Set[str]:
+ assert isinstance(es, list)
+ rs: Set[str] = set()
+ for e in es:
+ assert isinstance(e.get("__line__"), int), e
+ loc = Location(path, e["__line__"])
+ tags = e.get("tag")
+ with context(lambda: f"in {loc}:\n {tags}"):
+ e_i = e.copy()
+ name = e_i.pop("tag")
+ desc = e_i.pop("desc", "")
+ # ensure that each tag has a non-empty description
+ assert desc != ""
+ rs.add(name)
+ return rs
+
+
+@functools.lru_cache(maxsize=None)
+def parse_tags_yaml(path: str) -> Set[str]:
+ global _GLOBAL_PARSE_TAGS_YAML_CACHE
+ if path not in _GLOBAL_PARSE_TAGS_YAML_CACHE:
+ with open(path) as f:
+ es = yaml.load(f, Loader=LineLoader)
+ _GLOBAL_PARSE_TAGS_YAML_CACHE[path] = parse_tags_yaml_struct(es, path=path)
+
+ return _GLOBAL_PARSE_TAGS_YAML_CACHE[path]
+
+
+def parse_native_yaml(
+ path: str,
+ tags_yaml_path: str,
+ ignore_keys: Optional[Set[DispatchKey]] = None,
+ *,
+ skip_native_fns_gen: bool = False,
+ loaded_yaml: Optional[object] = None,
+) -> ParsedYaml:
+ global _GLOBAL_PARSE_NATIVE_YAML_CACHE
+ if path not in _GLOBAL_PARSE_NATIVE_YAML_CACHE:
+ valid_tags = parse_tags_yaml(tags_yaml_path)
+
+ # if a loaded yaml is provided, use that instead of reading from path
+ if loaded_yaml is None:
+ with open(path) as f:
+ es = yaml.load(f, Loader=LineLoader)
+ else:
+ es = loaded_yaml
+
+ _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] = parse_native_yaml_struct(
+ es,
+ valid_tags,
+ ignore_keys,
+ path=path,
+ skip_native_fns_gen=skip_native_fns_gen,
+ )
+
+ return _GLOBAL_PARSE_NATIVE_YAML_CACHE[path]
+
+
+# Some assertions are already performed during parsing, but those are only within a single NativeFunction.
+# Assertions here are meant to be performed across NativeFunctions.
+def error_check_native_functions(funcs: Sequence[NativeFunction]) -> None:
+ func_map: Dict[OperatorName, NativeFunction] = {}
+ base_func_map: Dict[BaseOperatorName, List[NativeFunction]] = defaultdict(list)
+ for f in funcs:
+ func_map[f.func.name] = f
+ base_func_map[f.func.name.name].append(f)
+ for f in funcs:
+ if f.structured_delegate is not None:
+ delegate_func = func_map[f.structured_delegate]
+ assert delegate_func.structured, (
+ f"{f.func.name} is marked as a structured_delegate pointing to "
+ f"{f.structured_delegate}, but {f.structured_delegate} is not marked as structured. "
+ f"Consider adding 'structured=True' to the delegated operator"
+ )
+ # See Note [resize_ in Functionalization]
+ # resize_() is technically an inplace view op (and therefore needs the tag),
+ # but it would be overkill to add a true "view" variant of resize.
+ # Instead, resize_() gets special treatment in functionalization,
+ # and we have a resize() op that is non-aliasing + functional.
+ if (
+ "inplace_view" in f.tags
+ and str(f.func.name) != "resize_"
+ and str(f.func.name) != "resize_as_"
+ and str(f.func.name.name) != "set_"
+ ):
+ base_name = f.func.name.name
+ overload_name = f.func.name.overload_name
+ assert base_name.inplace, (
+ f"{f.func.name} is marked with tag: inplace_view, but it doesn't follow the naming "
+ "convention for inplace ops - the codegen expects the base name to have a trailing underscore. "
+ )
+ out_of_place_base_name = BaseOperatorName(
+ base_name.base, False, base_name.dunder_method
+ )
+ assert len(base_func_map[out_of_place_base_name]) > 0, (
+ f"{f.func.name} is marked with tag: inplace_view. The codegen expects there to be a corresponding "
+ f"out-of-place view op with the name '{base_name}' and matching schema, but it didn't find one. "
+ )
+
+
+def cpp_string(s: str) -> str:
+ """Convert a python string into a c++ string literal"""
+ s = s.replace("\\", "\\\\")
+ s = s.replace('"', '\\"')
+ s = s.replace("\a", "\\a")
+ s = s.replace("\b", "\\b")
+ s = s.replace("\f", "\\f")
+ s = s.replace("\n", "\\n")
+ s = s.replace("\v", "\\v")
+ s = s.replace("\t", "\\t")
+ return f'"{s}"'
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# C++ CODE GENERATION
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+# Most functions in this section are curried: they consist of a function
+# that takes some parameters (e.g., what is to be generated) which itself
+# returns a function that actually maps NativeFunction to the code
+# to be generated. This pattern makes it convenient to use map, concatMap
+# and similar functional combinators.
+
+
+def static_dispatch_keys(backends: List[BackendIndex]) -> List[DispatchKey]:
+ if len(backends) == 0:
+ return []
+ else:
+ return [backend.dispatch_key for backend in backends] + [
+ DispatchKey.CompositeImplicitAutograd,
+ DispatchKey.CompositeImplicitAutogradNestedTensor,
+ DispatchKey.CompositeExplicitAutograd,
+ DispatchKey.CompositeExplicitAutogradNonFunctional,
+ ]
+
+
+def get_static_dispatch_backend(
+ f: NativeFunction, backend_index: BackendIndex
+) -> Optional[DispatchKey]:
+ if f.structured_delegate is not None or backend_index.has_kernel(f):
+ # TODO: for ops with structured_delegate it should check the dispatch table of
+ # the out variant instead. For now, these structured ops all have CPU/CUDA kernels
+ # so we always dispatch to the `backend`, but this could be wrong when we
+ # migrate math/default_backend ops to use structured delegate.
+ return backend_index.dispatch_key
+ elif f.has_composite_explicit_autograd_kernel:
+ return DispatchKey.CompositeExplicitAutograd
+ elif f.has_composite_explicit_autograd_non_functional_kernel:
+ return DispatchKey.CompositeExplicitAutogradNonFunctional
+ elif f.has_composite_implicit_autograd_kernel:
+ return DispatchKey.CompositeImplicitAutograd
+ elif f.has_composite_implicit_autograd_nested_tensor_kernel:
+ return DispatchKey.CompositeImplicitAutogradNestedTensor
+ return None
+
+
+def static_dispatch_ops_header(
+ f: NativeFunction, backend_index: List[BackendIndex]
+) -> Optional[str]:
+ if backend_index is None or f.manual_kernel_registration:
+ return None
+
+ output = []
+ for index in backend_index:
+ dispatch_key = get_static_dispatch_backend(f, index)
+ if dispatch_key is not None:
+ output.append(
+ f"#include "
+ )
+ return "\n".join(output)
+
+
+def static_dispatch_extra_headers(backends: List[BackendIndex]) -> List[str]:
+ return [
+ f"#include "
+ for dispatch_key in static_dispatch_keys(backends)
+ ]
+
+
+# Translates arguments of `sig` to CppSignature bindings.
+# Note that we have a special case for `memory_format` argument and this case is not covered by
+# tools.codegen.api.translate() yet as its application is limited to static dispatch.
+def translate_args(
+ sig: Union[CppSignature, DispatcherSignature],
+ cpp_sig: CppSignature,
+) -> str:
+ # Adds SpecialArgName.possibly_redundant_memory_format NamedCType for memory_format bindings
+ def add_spl_memory_format_binding(input_bindings: List[Binding]) -> List[Binding]:
+ output_bindings: List[Binding] = []
+ for binding in input_bindings:
+ if binding.name == "memory_format":
+ spl_mem_format_binding = Binding(
+ nctype=NamedCType(
+ SpecialArgName.possibly_redundant_memory_format,
+ binding.nctype.type,
+ ),
+ name=binding.name,
+ default=binding.default,
+ argument=binding.argument,
+ )
+ output_bindings.append(spl_mem_format_binding)
+ else:
+ output_bindings.append(binding)
+ return output_bindings
+
+ src_bindings = list(sig.arguments())
+ goal_bindings = list(cpp_sig.arguments())
+ # When last argument of CPP signature has SpecialArgName.possibly_redundant_memory_format NCType,
+ # get memory_format bindings of dispatcher signature to have the same NCType as well
+ for arg in goal_bindings:
+ if arg.nctype.name == SpecialArgName.possibly_redundant_memory_format:
+ src_bindings = add_spl_memory_format_binding(src_bindings)
+ break
+ exprs = translate(src_bindings, goal_bindings)
+ return ", ".join(a.expr for a in exprs)
+
+
+def generate_static_dispatch_backend_call(
+ sig: Union[CppSignature, DispatcherSignature],
+ f: NativeFunction,
+ backend_index: BackendIndex,
+) -> str:
+ cpp_sig = gen_static_dispatch_backend_call_signature(sig, f)
+ name = cpp_sig.name()
+ exprs = translate_args(sig, cpp_sig)
+ backend_metadata = backend_index.get_kernel(f)
+ kernel_ns = (
+ backend_metadata.cpp_namespace
+ if backend_metadata and backend_metadata.cpp_namespace
+ else DEFAULT_KERNEL_NAMESPACE
+ )
+ ns = kernel_ns.replace("::native", "")
+ return f"return {ns}::{backend_index.dispatch_key.lower()}::{name}({exprs});"
+
+
+def generate_static_dispatch_fallback_call(
+ sig: Union[CppSignature, DispatcherSignature],
+ f: NativeFunction,
+ backend_indices: List[BackendIndex],
+) -> str:
+ cpp_sigs = CppSignatureGroup.from_native_function(
+ f, method=False, fallback_binding=False
+ )
+ if sig.symint and f.func.has_symint():
+ cpp_sig = cpp_sigs.symint_signature
+ else:
+ cpp_sig = cpp_sigs.signature
+ assert cpp_sig is not None
+ name = cpp_sig.name()
+ exprs = translate_args(sig, cpp_sig)
+ ns = DEFAULT_KERNEL_NAMESPACE.replace("::native", "")
+ if f.has_composite_explicit_autograd_kernel:
+ return f"return {ns}::{DispatchKey.CompositeExplicitAutograd.lower()}::{name}({exprs});"
+ elif f.has_composite_explicit_autograd_non_functional_kernel:
+ return f"return {ns}::{DispatchKey.CompositeExplicitAutogradNonFunctional.lower()}::{name}({exprs});"
+ elif f.has_composite_implicit_autograd_kernel:
+ return f"return {ns}::{DispatchKey.CompositeImplicitAutograd.lower()}::{name}({exprs});"
+ elif f.has_composite_implicit_autograd_nested_tensor_kernel:
+ return f"return {ns}::{DispatchKey.CompositeImplicitAutogradNestedTensor.lower()}::{name}({exprs});"
+ else:
+ return f"""TORCH_CHECK(false, "Static dispatch does not support {name} for\
+{', '.join([str(index.dispatch_key) for index in backend_indices])} ");"""
+
+
+def static_dispatch(
+ sig: Union[CppSignature, DispatcherSignature],
+ f: NativeFunction,
+ backend_indices: List[BackendIndex],
+) -> str:
+ """
+ For a given `NativeFunction`, find out the corresponding backend and dispatch to it. If more than one
+ backend exists, fall back to determining the dispatch key from the inputs.
+ Arguments:
+ sig: A CppSignature or DispatcherSignature for this native function we want to use.
+ f: NativeFunction to generate static dispatch.
+ backend_indices: All available backends.
+ Return:
+ C++ code to call backend-specific functions, e.g., "return at::cpu::add(self, other, scale);"
+ """
+ if len(backend_indices) == 0 or f.manual_kernel_registration:
+ return ""
+
+ keys = [
+ b
+ for b in backend_indices
+ if b.has_kernel(f)
+ or (
+ f.structured_delegate is not None
+ and b.dispatch_key in STRUCTURED_DISPATCH_KEYS
+ )
+ ]
+ if len(keys) == 1:
+ return generate_static_dispatch_backend_call(sig, f, keys[0])
+ elif len(keys) == 0:
+ return generate_static_dispatch_fallback_call(sig, f, backend_indices)
+
+ native_tensor_args = [
+ a.name
+ for a in sig.arguments()
+ if isinstance(a.argument, SelfArgument)
+ or isinstance(a.argument, Argument)
+ and a.argument.type.is_tensor_like()
+ ]
+ tensor_args = ", ".join(native_tensor_args)
+ tensor_opts = f.func.arguments.tensor_options
+
+ stmts = []
+ subexprs: List[str] = []
+ if tensor_opts is not None:
+ subexprs.append(
+ "DispatchKeySet(c10::computeDispatchKey(dtype, layout, device))"
+ )
+ if tensor_args != "":
+ subexprs.append(f"c10::detail::multi_dispatch_key_set({tensor_args})")
+ stmts.append(f"""DispatchKeySet _dk_set = {' | '.join(subexprs)};""")
+ stmts.append("DispatchKey _dk = c10::highestPriorityBackendTypeId(_dk_set);")
+
+ dispatch_code = []
+ for index in keys:
+ dispatch_code.append(f"""case DispatchKey::{index.dispatch_key}:""")
+ dispatch_code.append(
+ f"""\t{generate_static_dispatch_backend_call(sig, f, index)};"""
+ )
+
+ fallback = generate_static_dispatch_fallback_call(sig, f, backend_indices)
+ connector = "\n\t\t"
+
+ return f"""
+ {connector.join(stmts)}
+ switch (_dk) {{
+ {connector.join(dispatch_code)}
+ default:
+ {fallback}
+ }}
+ """
+
+
+# Generates RegisterSchema.cpp. Depending on the selector, either
+# all schemas are registered, or only some are (in the case of
+# selective build)
+@dataclass(frozen=True)
+class RegisterSchema:
+ selector: SelectiveBuilder
+ known_tags: Dict[str, int] = field(default_factory=dict)
+
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> Optional[str]:
+ if not self.selector.is_native_function_selected(f):
+ return None
+ tags = "{" + ", ".join(f"at::Tag::{tag}" for tag in sorted(f.tags)) + "}"
+ if tags == "{}":
+ return f"m.def({cpp_string(str(f.func))}, {{}});\n"
+ maybe_tags = ""
+ if tags not in self.known_tags:
+ idx = len(self.known_tags)
+ self.known_tags[tags] = idx
+ maybe_tags = f"const std::vector tags_{idx} = {tags};\n"
+ return f"{maybe_tags}m.def({cpp_string(str(f.func))}, tags_{self.known_tags[tags]});\n"
+
+
+# Generates Operators.h and Operators.cpp.
+# These provide macros that, given an operator and overload name, allow users
+# to access an "un-overloaded" function version of the operator. This
+# is useful for extension writers who (1) want to decltype the operator
+# and (2) don't want to worry about method-only operators.
+@dataclass(frozen=True)
+class ComputeOperators:
+ target: Literal[Target.DECLARATION, Target.DEFINITION]
+ static_dispatch_backend_indices: List[BackendIndex]
+
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> str:
+ sig = DispatcherSignature.from_schema(f.func)
+ name = f.func.name.unambiguous_name()
+
+ if self.target is Target.DECLARATION:
+ # Note [The ATen Operators API]
+ # The ATen Operators API lives in the at::_ops namespace, and contains compile-time
+ # metadata about each operator + entry points into the Dispatcher.
+ # The C++ function, method, and redispatch API's are all implemented as wrappers
+ # into various bits of the structs defined here.
+ #
+ # Important characteristics about the Operators API:
+ # (1) It follows the Dispatcher API.
+ # This is kind of necessary to avoid overhead.
+ # For example: if it followed the C++ API, then all of the faithful C++ factory functions
+ # would need to wrap their arguments into TensorOptions only to unwrap them again.
+ # (2) Overload names are disambiguated.
+ # This is helpful for pytorch extenders who would like to decltype() an aten operator,
+ # that has overloads, e.g. decltype(at::_ops::mul_Tensor::call)
+ # (3) No argument defaulting is allowed.
+ # This is more of an implementation detail to avoid #include cycles,
+ # since TensorBody.h (which defines the Tensor class) needs to include this file.
+ # (4) manual_cpp_bindings and faithful names are not included in the API.
+ # This applies to stuff like __dispatch__is_complex(), and add_outf().
+ # These aren't "real aten ops", they're just additional functions provided by the C++ API.
+ # They're implemented as wrappers in Functions.h that call into the actual operators
+ # defined here, i.e. at::_ops::is_complex::call() and at::_ops::add_out::call().
+ # This means that ATEN_OP(is_complex) will not fastpath, and will go through the dispatcher.
+ return f"""
+struct TORCH_API {name} {{
+ using schema = {sig.type()};
+ using ptr_schema = schema*;
+ // See Note [static constexpr char* members for windows NVCC]
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::{f.func.name.name}")
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "{f.func.name.overload_name}")
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, {cpp_string(str(f.func))})
+ static {sig.defn(name="call", is_redispatching_fn=False)};
+ static {sig.defn(name="redispatch", is_redispatching_fn=True)};
+}};"""
+
+ elif self.target is Target.DEFINITION:
+ defns = f"""
+STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, name, "aten::{f.func.name.name}")
+STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, overload_name, "{f.func.name.overload_name}")
+STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, schema_str, {cpp_string(str(f.func))})
+
+// aten::{f.func}
+static C10_NOINLINE c10::TypedOperatorHandle<{name}::schema> create_{name}_typed_handle() {{
+ return c10::Dispatcher::singleton()
+ .findSchemaOrThrow({name}::name, {name}::overload_name)
+ .typed<{name}::schema>();
+}}
+"""
+ for is_redispatching_fn in [False, True]:
+ if is_redispatching_fn:
+ dispatcher_exprs_str = ", ".join(
+ ["dispatchKeySet"] + [a.name for a in sig.arguments()]
+ )
+ method_base = "redispatch"
+ else:
+ dispatcher_exprs_str = ", ".join([a.name for a in sig.arguments()])
+ method_base = "call"
+
+ dispatcher_call = method_base
+ method_name = f"{name}::{method_base}"
+
+ fn_body = f"""
+ static auto op = create_{name}_typed_handle();
+ return op.{dispatcher_call}({dispatcher_exprs_str});"""
+
+ if (
+ not is_redispatching_fn
+ and len(self.static_dispatch_backend_indices) > 0
+ ):
+ # call() should go through static dispatch
+ fn_body = static_dispatch(
+ sig, f, backend_indices=self.static_dispatch_backend_indices
+ )
+ defns += f"""
+// aten::{f.func}
+{sig.defn(name=method_name, is_redispatching_fn=is_redispatching_fn)} {{
+ {fn_body}
+}}
+"""
+ return defns
+ else:
+ assert_never(self.target)
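To make the DECLARATION branch concrete, the emitted struct for a hypothetical mul.Tensor overload would look roughly like the string below. This is schematic only; the real signature strings come from DispatcherSignature, so details will differ.

    expected_shape = """
    struct TORCH_API mul_Tensor {
      using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
      using ptr_schema = schema*;
      STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::mul")
      STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
      STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "mul.Tensor(Tensor self, Tensor other) -> Tensor")
      static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
      static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
    };
    """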
+
+
+# Generates Functions.h, which provides the functional public C++ API,
+# and the scaffolding to call into the dispatcher from these functions.
+@dataclass(frozen=True)
+class ComputeFunction:
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> Optional[str]:
+ sig_group = CppSignatureGroup.from_native_function(
+ f, method=False, fallback_binding=f.manual_cpp_binding
+ )
+ has_symint = f.func.has_symint()
+
+ result = ""
+ for sig in sig_group.signatures():
+ # See Note [The ATen Operators API]
+ target_sig = DispatcherSignature.from_schema(f.func)
+ exprs = translate(sig.arguments(), target_sig.arguments())
+ exprs_str = ", ".join([e.expr for e in exprs])
+
+ if sig.symint:
+ intlike_t = "c10::SymInt"
+ else:
+ intlike_t = "int64_t"
+
+ if Variant.function in f.variants:
+ result += f"""
+// aten::{f.func}
+inline {sig.decl()} {{
+ return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str});
+}}"""
+
+ # The template function can be used in template contexts where you
+ # want to switch between the symint and non-symint version
+ # depending on a template argument
+ #
+ # NB: we ALWAYS generate this even for methods. But we put it in
+ # this header so it can take advantage of per-op headers
+ if has_symint:
+ result += f"""
+namespace symint {{
+ template <typename T, typename = std::enable_if_t<std::is_same<T, {intlike_t}>::value>>
+ {sig.decl(suppress_symint_suffix=True)} {{
+ return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str});
+ }}
+}}
+"""
+ return result
+
+
+# Generates TensorBody.h. This file provides the object-oriented (method-based)
+# public C++ API, and the scaffolding to call into the dispatcher from these functions.
+@dataclass(frozen=True)
+class ComputeTensorMethod:
+ target: Literal[Target.DECLARATION, Target.DEFINITION]
+ static_dispatch_backend_indices: List[BackendIndex]
+
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> Optional[str]:
+ if Variant.method not in f.variants:
+ return None
+
+ assert not f.func.is_out_fn()
+ assert f.func.arguments.self_arg is not None
+
+ sig_group = CppSignatureGroup.from_native_function(
+ f, method=True, fallback_binding=f.manual_cpp_binding
+ )
+
+ if self.target is Target.DECLARATION:
+ result = ""
+ for sig in sig_group.signatures():
+ result += f"{sig.decl()} const;\n"
+ return result
+
+ if self.target is not Target.DEFINITION:
+ assert_never(self.target)
+
+ result = ""
+
+ for sig in sig_group.signatures():
+ target_sig = DispatcherSignature.from_schema(f.func)
+ exprs = translate(sig.arguments(), target_sig.arguments(), method=True)
+ exprs_str = ", ".join([e.expr for e in exprs])
+
+ result += f"""
+// aten::{f.func}
+inline {sig.defn(prefix="Tensor::")} const {{
+ return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str});
+}}
+"""
+
+ return result
+
+
+# Generates RedispatchFunctions.h.
+# This is similar to the C++ API defined in Functions.h, but provides access
+# to the dispatcher's redispatch API.
+@dataclass(frozen=True)
+class ComputeRedispatchFunction:
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> Optional[str]:
+ # We unconditionally generate function variants of the redispatch API.
+ # This is mainly because we can namespace functions separately, but not methods,
+ sig_group = CppSignatureGroup.from_native_function(
+ f, method=False, fallback_binding=f.manual_cpp_binding
+ )
+
+ result = ""
+ for sig in sig_group.signatures():
+ target_sig = DispatcherSignature.from_schema(f.func)
+ exprs = translate(sig.arguments(), target_sig.arguments())
+ exprs_str = ", ".join(["dispatchKeySet"] + [a.expr for a in exprs])
+
+ result += f"""
+// aten::{f.func}
+inline {sig.decl(is_redispatching_fn=True)} {{
+ return at::_ops::{f.func.name.unambiguous_name()}::redispatch({exprs_str});
+}}
+"""
+
+ return result
+
+
+# Generates ATenOpList.cpp, a runtime accessible list of all aten
+# operators.
+# TODO: This was historically used to help some JIT interop code
+# figure out whether or not to treat aten namespace'd operators
+# one way or another, we should reevaluate if this is actually needed.
+@with_native_function
+def compute_aten_op(f: NativeFunction) -> str:
+ return f'{{"aten::{f.func.name.name}", "{f.func.name.overload_name}"}},'
+
+
+# Generates MetaFunctions.h
+def compute_meta_function_declaration(g: NativeFunctionsGroup) -> Optional[str]:
+ if not g.structured:
+ return None
+ with native_function_manager(g.out):
+ name = meta.name(g)
+ args = structured.meta_arguments(g)
+ args_str = ", ".join(a.decl() for a in args)
+ parent_class = g.out.structured_inherits
+ if parent_class is None:
+ parent_class = "at::impl::MetaBase"
+ meta_return = "void"
+ precomputed = g.out.precomputed if g.structured else None
+
+ if precomputed:
+ # Generate the template declaration with one bool parameter for each
+ # precomputed element. Each parameter is true if the corresponding (in
+ # terms of position) precomputed element has been set.
+ precomputed_values = [*precomputed.replace.values(), precomputed.add]
+ precomputed_elements = [
+ elem for replace_list in precomputed_values for elem in replace_list
+ ]
+ precomputed_template_parameters = [
+ elem.name.upper() for elem in precomputed_elements
+ ]
+ precomputed_template_params_str = ", ".join(
+ f"bool {param} = false" for param in precomputed_template_parameters
+ )
+ precompute_template_decl = f"template <{precomputed_template_params_str}>"
+
+ # Generate a string containing declarations of all precomputed elements.
+ precomputed_elements_with_cpp_types = [
+ structured.argument_type(elem, binds=elem.name)
+ for elem in precomputed_elements
+ ]
+
+ precomputed_elements_decl = ";\n".join(
+ f"{elem.cpp_type(strip_ref=True)} {elem.name}"
+ for elem in precomputed_elements_with_cpp_types
+ )
+
+ # Generate "setter" methods for each precomputed element. Each method will return
+ # a new instance of precompute_out with the template parameter that corresponds to
+ # the member set by the method to true (to indicate that it has been set).
+ setter_methods = []
+ for i, elem in enumerate(precomputed_elements):
+ # Generate the signature. The return type will be the same
+ # as the type of `this` but with the template parameter
+ # corresponding to the element set by this method set to true.
+ # The assert generated below will ensure that this template
+ # parameter is false on the type of `this`.
+ return_ty_templates = ", ".join(
+ precomputed_template_parameters[:i]
+ + ["true"]
+ + precomputed_template_parameters[i + 1 :]
+ )
+ return_ty = f"precompute_out<{return_ty_templates}>"
+ elem_cpp_ty = precomputed_elements_with_cpp_types[i].cpp_type(
+ strip_ref=True
+ )
+ signature = f"{return_ty} set_{elem.name}({elem_cpp_ty} value)"
+
+ # Generate an assert which checks that the
+ # template parameter corresponding to the precomputed
+ # element that is set by this method is false on the
+ # class corresponding to the object that `this` points to.
+ # This ensures that each element can be set only once.
+ assert_msg = f'"{precomputed_elements[i].name} already set"'
+ assert_stmt = f"static_assert({precomputed_template_parameters[i]} == false, {assert_msg});"
+
+ # Generate the new object construction block. All state
+ # except the element that this method sets is copied from the
+ # object that `this` points to. The value for the element that
+ # the method sets is taken from a method parameter.
+ construction_stmts = []
+ construction_stmts.append(f"{return_ty} ret;")
+
+ for j, elem in enumerate(precomputed_elements):
+ if i == j:
+ construction_stmts.append(f"ret.{elem.name} = value;")
+ else:
+ construction_stmts.append(
+ f"ret.{elem.name} = this->{elem.name};"
+ )
+
+ construction_stmts.append("return ret;")
+ construction_block = "\n".join(construction_stmts)
+
+ setter_methods.append(
+ f"""
+ {signature} {{
+ {assert_stmt}
+ {construction_block}
+ }}
+ """
+ )
+ setter_methods_decl = "\n".join(setter_methods)
+
+ # Meta should return an instance of the struct containing the precomputed elements.
+ meta_return_template_params = ", ".join(
+ ["true"] * len(precomputed_template_parameters)
+ )
+ # This typedef (actually a using statement) is needed so that TORCH_META_FUNC can reuse the return
+ # type (which has a variable number of template parameters).
+ meta_return_typedef = f"using meta_return_ty = precompute_out <{meta_return_template_params}>;"
+ meta_return = "meta_return_ty"
+ precomputed_decl = f"""
+ {precompute_template_decl}
+ struct TORCH_API precompute_out {{
+ {setter_methods_decl}
+ {precomputed_elements_decl};
+ }};"""
+ else:
+ meta_return_typedef = ""
+ precomputed_decl = ""
+
+ return f"""\
+struct TORCH_API structured_{name} : public {parent_class} {{
+ {precomputed_decl}
+ {meta_return_typedef}
+ {meta_return} meta({args_str});
+}};
+"""
+
+
+def needs_backend_select(f: NativeFunction, selector: SelectiveBuilder) -> bool:
+ name = str(f.func.name.name)
+ if name.endswith("_like") or name.startswith("new_"):
+ return False
+ if f.func.arguments.tensor_options is None:
+ return False
+ return selector.is_native_function_selected(f)
+
+
+# Generates RegisterBackendSelect.cpp, a series of kernels which provide
+# specialized computation of dispatch key for operator signatures which cannot
+# be easily done automatically using templating.
+@dataclass(frozen=True)
+class ComputeBackendSelect:
+ target: Literal[Target.DEFINITION, Target.REGISTRATION]
+
+ # Selector object to determine which operators to generate
+ # registration code for.
+ selector: SelectiveBuilder
+
+ @method_with_native_function
+ def __call__(self, f: NativeFunction) -> Optional[str]:
+ if not needs_backend_select(f, self.selector):
+ return None
+
+ name = native.name(f.func)
+ # BackendSelect can go to Meta, so it must preserve symints
+ native_sig = NativeSignature(f.func, symint=True)
+
+ native_tensor_args = [
+ a
+ for a in native_sig.arguments()
+ if isinstance(a.argument, Argument) and a.argument.type.is_tensor_like()
+ ]
+
+ dispatcher_sig = DispatcherSignature.from_schema(f.func)
+
+ sig: Union[NativeSignature, DispatcherSignature]
+ sig = dispatcher_sig
+ dispatcher_exprs = dispatcher_sig.exprs()
+ dispatch_key = "c10::computeDispatchKey(dtype, layout, device)"
+
+ if self.target is Target.DEFINITION:
+ # I don't think there's actually a good reason to generate
+ # these two cases differently
+ # The first case could probably be improved, though: it calls computeDispatchKeySet(),
+ # which looks at TLS dispatch keys; there should not be any by the time we reach backend select.
+ if native_tensor_args:
+ assert f.func.arguments.has_tensor_arg()
+ tensor_args = ", ".join(a.name for a in native_tensor_args)
+ compute_dk = f"""\
+DispatchKeySet _dk_set = c10::DispatchKeySet({dispatch_key}) | c10::detail::multi_dispatch_key_set({tensor_args});
+DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect);
+DispatchKeySet _dk = c10::impl::computeDispatchKeySet(_dk_set, _dk_mask);"""
+ else:
+ assert not f.func.arguments.has_tensor_arg()
+ compute_dk = (
+ f"DispatchKeySet _dk = c10::DispatchKeySet({dispatch_key});"
+ )
+ return f"""\
+// aten::{f.func}
+C10_ALWAYS_INLINE
+{sig.defn(name)} {{
+ {compute_dk}
+ return at::_ops::{f.func.name.unambiguous_name()}::redispatch(
+ _dk, {', '.join(a.expr for a in dispatcher_exprs)});
+}}
+"""
+ elif self.target is Target.REGISTRATION:
+ return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));"""
+ else:
+ assert_never(self.target)
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# YAML CODE GENERATION
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+def format_yaml(data: object) -> str:
+ # Ignore alias in Dumper
+ YamlDumper.ignore_aliases = lambda self, data: True # type: ignore[assignment]
+
+ # Support serializing OrderedDict
+ def dict_representer(dumper: Any, data: Any) -> Any:
+ return dumper.represent_dict(data.items())
+
+ YamlDumper.add_representer(OrderedDict, dict_representer) # type: ignore[no-untyped-call]
+ # Some yaml parsers (e.g. Haskell's) don't understand line breaks.
+ # width=1e9 turns off optional line breaks and improves
+ # the portability of the outputted yaml.
+ return yaml.dump(data, default_flow_style=False, Dumper=YamlDumper, width=1e9) # type: ignore[no-any-return, call-overload]
+
+
+# For some reason, some defaults we write to YAML are written as native
+# YAML objects, rather than doing them uniformly as strings. This
+# function detects those cases and converts them into native Python
+# objects.
+def pythonify_default(s: str) -> object:
+ if s == "true":
+ return True
+ elif s == "false":
+ return False
+
+ try:
+ return int(s)
+ except ValueError:
+ try:
+ return float(s)
+ except ValueError:
+ return s
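A few concrete conversions performed by this helper:

    assert pythonify_default("true") is True
    assert pythonify_default("1") == 1
    assert pythonify_default("0.5") == 0.5
    assert pythonify_default("contiguous_format") == "contiguous_format"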
+
+
+# What is a dynamic type? Over time, the semantic meaning of
+# dynamic type has degraded to meaninglessness (in the old days,
+# it captured dtype-ness of types, but that has gone away with
+# the removal of TH). These days, it's mostly the same thing as
+# the C++ API argument type, except that Tensor and Tensor?
+# arguments simply present as Tensor.
+#
+# TODO: Get rid of dynamic_type, after getting tools/autograd
+# to use the new codegen framework
+def dynamic_type(t: Type) -> str:
+ if isinstance(t, OptionalType):
+ return dynamic_type(t.elem)
+ # Note we don't use t.is_tensor_like() here because it would
+ # also include Tensor[]
+ if str(t) == "Tensor":
+ return "at::Tensor"
+ # This is a legacy concept, so never report SymInt
+ return cpp.argumenttype_type(
+ t, mutable=False, binds="__placeholder__", symint=False
+ ).cpp_type()
+
+
+def compute_method_of_yaml(variants: Set[Variant]) -> List[str]:
+ # This is written out explicitly to ensure that Tensor and
+ # namespace are put into the list in the right order
+ method_of = ["Type"]
+ if Variant.method in variants:
+ method_of.append("Tensor")
+ if Variant.function in variants:
+ method_of.append("namespace")
+ return method_of
+
+
+def compute_returns_yaml(
+ f: NativeFunction,
+) -> Tuple[List[Dict[str, str]], Dict[str, str]]:
+ # Note [name and field_name]
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~
+ # To understand name_to_field_name, we must first talk about this
+ # schema:
+ #
+ # lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)
+ #
+ # There is something very odd about this schema: it is an out
+ # variant of the function (that is to say, it will convert into
+ # at::lstsq_out() in the C++ API), but the names of the output
+ # return arguments don't match the keyword argument names of
+ # the inputs. It TURNS OUT that in this situation, the historical
+ # Declarations.yaml we want to output is this (abbreviated to
+ # only show relevant fields):
+ #
+ # arguments:
+ # ...
+ # - field_name: solution
+ # name: X
+ # - field_name: QR
+ # name: qr
+ # ...
+ #
+ # returns:
+ # - field_name: solution
+ # name: X
+ # - field_name: QR
+ # name: qr
+ #
+ # The name of the return fields is stored in 'field_name', and the
+ # name of the arguments is stored in 'name'. So when we process
+ # arguments, we need a way to get at the corresponding return. At
+ # the moment, this is most conveniently done by constructing a
+ # mapping from name (the argument concept) to field_name (the
+ # return concept) while processing return arguments, since we don't
+ # directly maintain this correspondence in the modeling of function
+ # schema itself.
+ #
+ # See also https://github.com/pytorch/pytorch/issues/43114
+ name_to_field_name: Dict[str, str] = {}
+
+ # Compute the returns field of the YAML entry
+ names = cpp.return_names(f)
+ returns = []
+ for i, (r, name) in enumerate(zip(f.func.returns, names)):
+ ret = {
+ "dynamic_type": dynamic_type(r.type),
+ "name": name,
+ # legacy, report ints
+ "type": cpp.return_type(r, symint=False).cpp_type(),
+ }
+
+ if r.name:
+ # See Note [name and field_name]
+ ret["field_name"] = r.name
+ if f.func.is_out_fn():
+ name_to_field_name[f.func.arguments.out[i].name] = r.name
+
+ returns.append(ret)
+
+ return returns, name_to_field_name
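For the lstsq.X schema discussed in the note above, the computed structures come out roughly as follows (only the name/field_name pairs are shown):

    expected_returns = [
        {"name": "X", "field_name": "solution"},    # plus "dynamic_type" / "type" entries
        {"name": "qr", "field_name": "QR"},
    ]
    expected_name_to_field_name = {"X": "solution", "qr": "QR"}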
+
+
+# arguments in yaml roughly corresponds to the public C++ API
+def compute_cpp_argument_yaml(
+ cpp_a: Binding,
+ *,
+ schema_order: bool,
+ kwarg_only_set: Set[str],
+ out_arg_set: Set[str],
+ name_to_field_name: Dict[str, str],
+) -> object:
+ if isinstance(cpp_a.argument, TensorOptionsArguments):
+ arg: Dict[str, object] = {
+ "annotation": None,
+ "dynamic_type": "at::TensorOptions",
+ "is_nullable": False,
+ "name": cpp_a.name,
+ "type": cpp_a.type,
+ "kwarg_only": True,
+ }
+ if cpp_a.default is not None:
+ arg["default"] = cpp_a.default
+ return arg
+ elif isinstance(cpp_a.argument, SelfArgument):
+ raise AssertionError()
+ elif isinstance(cpp_a.argument, Argument):
+ return compute_argument_yaml(
+ cpp_a.argument,
+ schema_order=schema_order,
+ kwarg_only_set=kwarg_only_set,
+ out_arg_set=out_arg_set,
+ name_to_field_name=name_to_field_name,
+ )
+
+
+def compute_argument_yaml(
+ a: Argument,
+ *,
+ schema_order: bool,
+ kwarg_only_set: Set[str],
+ out_arg_set: Set[str],
+ name_to_field_name: Dict[str, str],
+) -> object:
+ arg: Dict[str, object] = {
+ "annotation": str(a.annotation) if a.annotation else None,
+ "dynamic_type": dynamic_type(a.type),
+ "is_nullable": a.type.is_nullable(),
+ "name": a.name,
+ # legacy, report ints
+ "type": cpp.argument_type(a, binds="__placeholder__", symint=False).cpp_type(),
+ }
+ if a.default is not None:
+ arg["default"] = pythonify_default(
+ cpp.default_expr(a.default, a.type, symint=False)
+ )
+ if a.name in kwarg_only_set:
+ arg["kwarg_only"] = True
+ if a.name in out_arg_set:
+ arg["output"] = True
+ arg["allocate"] = True
+ # See Note [name and field_name]
+ if a.name in name_to_field_name:
+ arg["field_name"] = name_to_field_name[a.name]
+ # Historically, booleans don't get their size recorded, because it
+ # is already built into the cpp type (e.g., std::array<bool, N>)
+ l = a.type.is_list_like()
+ if l is not None and l.size is not None and str(l.elem) != "bool":
+ arg["size"] = l.size
+ return arg
+
+
+@with_native_function
+def compute_declaration_yaml(f: NativeFunction) -> object:
+ returns, name_to_field_name = compute_returns_yaml(f)
+
+ # These sets are used to conveniently test if an argument is a
+ # kwarg-only or out argument
+ kwarg_only_set = {a.name for a in f.func.arguments.flat_kwarg_only}
+ out_arg_set = {a.name for a in f.func.arguments.out}
+
+ sig_group = CppSignatureGroup.from_native_function(
+ f, method=False, fallback_binding=False
+ )
+ cpp_args = sig_group.signature.arguments()
+ arguments = [
+ compute_cpp_argument_yaml(
+ cpp_a,
+ schema_order=False,
+ kwarg_only_set=kwarg_only_set,
+ out_arg_set=out_arg_set,
+ name_to_field_name=name_to_field_name,
+ )
+ for cpp_a in cpp_args
+ ]
+
+ schema_order_jit_arguments = list(f.func.schema_order_arguments())
+
+ schema_order_arguments = [
+ compute_argument_yaml(
+ a,
+ schema_order=True,
+ kwarg_only_set=kwarg_only_set,
+ out_arg_set=out_arg_set,
+ name_to_field_name=name_to_field_name,
+ )
+ for a in schema_order_jit_arguments
+ ]
+
+ cpp_schema_order_types = [
+ # NB: method here doesn't matter
+ r.type
+ for a in schema_order_jit_arguments
+ for r in cpp.argument(
+ a,
+ method=False,
+ cpp_no_default_args=set(),
+ faithful=False,
+ symint=False,
+ has_tensor_options=False,
+ )
+ ]
+
+ # legacy, report ints
+ cpp_returns = cpp.returns_type(f.func.returns, symint=False).cpp_type()
+ schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})"
+
+ is_factory_method = (
+ any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args)
+ and Variant.method not in f.variants
+ )
+
+ return OrderedDict(
+ [
+ ("name", cpp.name(f.func)),
+ ("operator_name", str(f.func.name.name)),
+ ("overload_name", str(f.func.name.overload_name)),
+ ("manual_kernel_registration", f.manual_kernel_registration),
+ (
+ "category_override",
+ f.category_override if f.category_override is not None else "",
+ ),
+ ("schema_string", f"aten::{f.func}"),
+ ("arguments", arguments),
+ ("schema_order_cpp_signature", schema_order_cpp_signature),
+ ("schema_order_arguments", schema_order_arguments),
+ ("method_of", compute_method_of_yaml(f.variants)),
+ ("mode", "native"),
+ ("python_module", "" if f.python_module is None else f.python_module),
+ ("returns", returns),
+ ("inplace", f.func.name.name.inplace),
+ ("is_factory_method", is_factory_method),
+ ("abstract", f.is_abstract),
+ ("device_guard", f.device_guard),
+ ("with_gil", False),
+ ("deprecated", False),
+ ("has_math_kernel", f.has_composite_implicit_autograd_kernel),
+ ]
+ )
+
+
+# See Note [Auto generated composite kernels]
+def has_autogenerated_composite_kernel(f: NativeFunction) -> bool:
+ return (f.structured or f.structured_delegate is not None) and (
+ f.func.kind() == SchemaKind.functional or f.func.kind() == SchemaKind.inplace
+ )
+
+
+@with_native_function_and_indices
+def compute_registration_declarations(
+ f: NativeFunction, backend_indices: Dict[DispatchKey, BackendIndex]
+) -> str:
+ name = dispatcher.name(f.func)
+ returns_type = dispatcher.returns_type(
+ f.func.returns
+ ).cpp_type_registration_declarations()
+ args = dispatcher.arguments(f.func)
+ args_str = ", ".join(a.no_default().decl_registration_declarations() for a in args)
+ comment_data: Dict[str, str] = {
+ "schema": f"aten::{f.func}",
+ # TODO: What exactly are the semantics of the 'dispatch' field?
+ "dispatch": str(
+ {k for k, v in backend_indices.items() if v.has_kernel(f)}
+ != {DispatchKey.CompositeImplicitAutograd}
+ and {k for k, v in backend_indices.items() if v.has_kernel(f)}
+ != {
+ DispatchKey.CompositeImplicitAutograd,
+ DispatchKey.CompositeImplicitAutogradNestedTensor,
+ }
+ ),
+ "default": str(f.has_composite_kernel or has_autogenerated_composite_kernel(f)),
+ }
+ return f"""{returns_type} {name}({args_str}); // {json.dumps(comment_data)}
+"""
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+#
+# RUN IT ALL
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+
+
+def get_custom_build_selector(
+ provided_op_registration_allowlist: Optional[List[str]],
+ op_selection_yaml_path: Optional[str],
+) -> SelectiveBuilder:
+ assert not (
+ provided_op_registration_allowlist is not None
+ and op_selection_yaml_path is not None
+ ), (
+ "Both provided_op_registration_allowlist and "
+ + "op_selection_yaml_path can NOT be provided at the "
+ + "same time."
+ )
+
+ op_registration_allowlist: Optional[Set[str]] = None
+ if provided_op_registration_allowlist is not None:
+ op_registration_allowlist = set(provided_op_registration_allowlist)
+
+ if op_registration_allowlist is not None:
+ selector = SelectiveBuilder.from_legacy_op_registration_allow_list(
+ op_registration_allowlist,
+ True,
+ False,
+ )
+ elif op_selection_yaml_path is not None:
+ selector = SelectiveBuilder.from_yaml_path(op_selection_yaml_path)
+ else:
+ selector = SelectiveBuilder.get_nop_selector()
+
+ return selector
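+
+# Usage sketch (argument values are hypothetical):
+#   selector = get_custom_build_selector(["aten::add.Tensor"], None)
+# builds a selector from an explicit allowlist, while
+#   selector = get_custom_build_selector(None, "path/to/selected_operators.yaml")
+# loads one from a selective-build YAML file; passing both trips the assert
+# above, and passing neither yields the no-op selector.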
+
+
+def get_grouped_by_view_native_functions(
+ native_functions: Sequence[NativeFunction],
+) -> Sequence[Union[NativeFunction, NativeFunctionsViewGroup]]:
+ def maybe_create_view_group(
+ d: Dict[Union[ViewSchemaKind, SchemaKind], NativeFunction]
+ ) -> List[Union[NativeFunction, NativeFunctionsViewGroup]]:
+ funcs: List[Union[NativeFunction, NativeFunctionsViewGroup]] = []
+ if ViewSchemaKind.aliasing in d:
+ view = d.pop(ViewSchemaKind.aliasing)
+ view_inplace = d.pop(ViewSchemaKind.aliasing_inplace, None)
+ view_copy = d.pop(SchemaKind.functional, None)
+
+ funcs.append(
+ NativeFunctionsViewGroup(
+ view=view,
+ view_copy=view_copy,
+ view_inplace=view_inplace,
+ )
+ )
+ # Take the remaining functions that weren't part of the view group
+ # and emit them separately
+ funcs.extend(d.values())
+ return funcs
+
+ grouped_by_views: Dict[
+ FunctionSchema, Dict[Union[SchemaKind, ViewSchemaKind], NativeFunction]
+ ] = defaultdict(dict)
+ for f in native_functions:
+ schema = f.func.view_signature()
+ view_kind: ViewSchemaKind = f.view_schema_kind
+ # We need to group up ops relevant to the same "view", consisting of:
+ # view op (ViewSchemaKind.aliasing)
+ # view_inplace op (ViewSchemaKind.aliasing_inplace)
+ # view_copy op (SchemaKind.functional)
+ if view_kind == ViewSchemaKind.non_aliasing:
+ kind = f.func.kind()
+ assert kind not in grouped_by_views[schema]
+ grouped_by_views[schema][kind] = f
+ else:
+ assert view_kind not in grouped_by_views[schema]
+ grouped_by_views[schema][view_kind] = f
+
+ return list(concatMap(maybe_create_view_group, grouped_by_views.values()))
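+
+# Example of the grouping (operator names only illustrate the pattern): a view
+# op such as `transpose`, its aliasing in-place variant `transpose_`, and its
+# functional `transpose_copy` counterpart share a view signature and collapse
+# into a single NativeFunctionsViewGroup; non-aliasing ops with the same
+# signature are passed through as plain NativeFunctions.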
+
+
+def get_grouped_native_functions(
+ native_functions: Sequence[NativeFunction],
+) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]:
+ def flatten_pre_group(
+ d: Dict[SchemaKind, NativeFunction]
+ ) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]:
+ r = NativeFunctionsGroup.from_dict(d)
+ if r is None:
+ # Invariant: any NativeFunctions that are code-generated
+ # should have been grouped into NativeFunctionsGroup objects
+ assert not any("generated" in f.tags for f in d.values())
+ return list(d.values())
+ else:
+ return [r]
+
+ # TODO: why isn't ValuesView a Sequence?
+ pre_grouped_native_functions = pre_group_native_functions(native_functions)
+ return list(
+ concatMap(flatten_pre_group, list(pre_grouped_native_functions.values()))
+ )
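+
+# Example of the grouping (operator names only illustrate the pattern): the
+# functional `add.Tensor`, the in-place `add_.Tensor`, and the out variant
+# `add.out` share a signature and collapse into one NativeFunctionsGroup;
+# functions that cannot be grouped are passed through unchanged.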
+
+
+def get_ns_grouped_kernels(
+ *,
+ grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]],
+ backend_indices: Dict[DispatchKey, BackendIndex],
+ native_function_decl_gen: Callable[
+ [Union[NativeFunctionsGroup, NativeFunction], BackendIndex], List[str]
+ ] = dest.compute_native_function_declaration,
+) -> Dict[str, List[str]]:
+ ns_grouped_kernels: Dict[str, List[str]] = defaultdict(list)
+ for f in grouped_native_functions:
+ native_function_namespaces = set()
+ dispatch_keys = set()
+ for dispatch_key, backend_idx in backend_indices.items():
+ backend_metadata = backend_idx.get_kernel(f)
+ if backend_metadata:
+ namespace = backend_metadata.cpp_namespace
+ dispatch_keys.add(dispatch_key)
+ native_function_namespaces.add(namespace)
+ else:
+ namespace = DEFAULT_KERNEL_NAMESPACE
+ assert (
+ len(native_function_namespaces) <= 1
+ ), f"Codegen only supports one namespace per operator, got {native_function_namespaces} from {dispatch_keys}"
+ ns_grouped_kernels[namespace].extend(
+ native_function_decl_gen(f, backend_idx)
+ )
+ return ns_grouped_kernels
+
+
+def get_native_function_declarations_from_ns_grouped_kernels(
+ *,
+ ns_grouped_kernels: Dict[str, List[str]],
+) -> List[str]:
+ declarations: List[str] = []
+ newline = "\n"
+ for namespace, kernels in ns_grouped_kernels.items():
+ ns_helper = NamespaceHelper(
+ namespace_str=namespace,
+ entity_name="",
+ max_level=4,
+ )
+ # De-duplicate kernel names while preserving their order. Backends are
+ # allowed to repeat kernel names; only generate the declaration once!
+ ordered_kernels = list(OrderedDict.fromkeys(kernels))
+ declarations.extend(
+ f"""
+{ns_helper.prologue}
+{newline.join(ordered_kernels)}
+{ns_helper.epilogue}
+ """.split(
+ newline
+ )
+ )
+ return declarations
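+
+# Illustrative shape of the result (namespace and declaration are hypothetical):
+# the de-duplicated declarations of each namespace, e.g.
+#   TORCH_API at::Tensor my_kernel(const at::Tensor & self);
+# are wrapped between that namespace's prologue and epilogue (the lines that
+# open and close something like `namespace at::native { ... }`) and the whole
+# block is split into one string per line.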
+
+
+# Return native function declarations grouped by their namespaces.
+def get_native_function_declarations(
+ *,
+ grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]],
+ backend_indices: Dict[DispatchKey, BackendIndex],
+ native_function_decl_gen: Callable[
+ [Union[NativeFunctionsGroup, NativeFunction], BackendIndex], List[str]
+ ] = dest.compute_native_function_declaration,
+) -> List[str]:
+ """
+ Generate kernel declarations in `NativeFunction(s).h`.
+ :param grouped_native_functions: a sequence of `NativeFunction` or `NativeFunctionsGroup`.
+ :param backend_indices: kernel collections grouped by dispatch key.
+ :param native_function_decl_gen: callable that generates the kernel declaration for each `NativeFunction`.
+ :return: a list of strings containing all declarations, grouped by namespace and split by newline.
+ """
+
+ ns_grouped_kernels = get_ns_grouped_kernels(
+ grouped_native_functions=grouped_native_functions,
+ backend_indices=backend_indices,
+ native_function_decl_gen=native_function_decl_gen,
+ )
+ return get_native_function_declarations_from_ns_grouped_kernels(
+ ns_grouped_kernels=ns_grouped_kernels
+ )
+
+
+def get_kernel_namespace(
+ *, f: Union[NativeFunction, NativeFunctionsGroup], backend_idx: BackendIndex
+) -> str:
+ backend_metadata = backend_idx.get_kernel(f)
+ assert not backend_metadata or "::native" in backend_metadata.cpp_namespace, (
+ f"The kernel for function {f.func.name if isinstance(f, NativeFunction) else f.functional.func.name} "
+ f"with dispatch key {backend_idx.dispatch_key}"
+ f" has namespace {backend_metadata.cpp_namespace}, which does not end with '::native'."
+ )
+ return (
+ backend_metadata.cpp_namespace if backend_metadata else DEFAULT_KERNEL_NAMESPACE
+ )
+
+
+# Return native function definitions grouped by dispatch key and custom namespace.
+# Used in RegisterDispatchKey.cpp, etc.
+def get_native_function_definitions(
+ *,
+ fm: FileManager,
+ grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]],
+ dispatch_key: DispatchKey,
+ backend_idx: BackendIndex,
+ selector: SelectiveBuilder,
+ rocm: bool,
+ symint: bool,
+ skip_dispatcher_op_registration: bool,
+ gen_dispatch_helpers: bool,
+) -> List[str]:
+ definitions: List[str] = []
+ ns_definitions: Dict[str, List[str]] = defaultdict(list)
+ anonymous_definitions: Dict[str, List[str]] = defaultdict(list)
+ registrations: Dict[str, Dict[str, List[str]]] = defaultdict(dict)
+ newline = "\n"
+ ns_gen = dest.RegisterDispatchKey(
+ backend_idx,
+ Target.NAMESPACED_DEFINITION,
+ selector,
+ rocm=rocm,
+ symint=symint,
+ class_method_name=None,
+ skip_dispatcher_op_registration=skip_dispatcher_op_registration,
+ )
+ anonymous_gen = dest.RegisterDispatchKey(
+ backend_idx,
+ Target.ANONYMOUS_DEFINITION,
+ selector,
+ rocm=rocm,
+ symint=symint,
+ class_method_name=None,
+ skip_dispatcher_op_registration=skip_dispatcher_op_registration,
+ )
+ reg_gen = dest.RegisterDispatchKey(
+ backend_idx,
+ Target.REGISTRATION,
+ selector,
+ rocm=rocm,
+ symint=symint,
+ class_method_name=None,
+ skip_dispatcher_op_registration=skip_dispatcher_op_registration,
+ )
+ for f in grouped_native_functions:
+ kernel_namespace = get_kernel_namespace(f=f, backend_idx=backend_idx).replace(
+ "::native", ""
+ )
+
+ ns_definitions[kernel_namespace].extend(
+ ns_gen(f),
+ )
+ anonymous_definitions[kernel_namespace].extend(
+ anonymous_gen(f),
+ )
+ namespace = (
+ f.namespace if isinstance(f, NativeFunction) else f.functional.namespace
+ )
+ if namespace not in registrations[kernel_namespace]:
+ registrations[kernel_namespace] = defaultdict(list)
+ registrations[kernel_namespace][namespace].extend(
+ reg_gen(f),
+ )
+
+ for kernel_namespace in ns_definitions:
+ if len(ns_definitions[kernel_namespace]) == 0:
+ continue
+ ns_helper = NamespaceHelper(namespace_str=kernel_namespace)
+ registration_body = ""
+ for namespace in registrations[kernel_namespace]:
+ if not registrations[kernel_namespace][namespace]:
+ continue
+ registration_body += f"""
+TORCH_LIBRARY_IMPL({namespace}, {dispatch_key}, m) {{
+ {newline.join(registrations[kernel_namespace][namespace])}
+}};"""
+ definitions.extend(
+ fm.substitute_with_template(
+ "RegisterDispatchDefinitions.ini",
+ lambda: {
+ "ns_prologue": ns_helper.prologue,
+ "ns_epilogue": ns_helper.epilogue,
+ "dispatch_helpers": dest.gen_registration_helpers(backend_idx)
+ if gen_dispatch_helpers
+ else [],
+ "dispatch_anonymous_definitions": anonymous_definitions[
+ kernel_namespace
+ ],
+ "static_init_dispatch_registrations": ""
+ if skip_dispatcher_op_registration
+ else registration_body,
+ "deferred_dispatch_registrations": "",
+ "dispatch_namespace": dispatch_key.lower(),
+ "dispatch_namespaced_definitions": ns_definitions[kernel_namespace],
+ },
+ ).split(newline)
+ )
+
+ return definitions
+
+
+# Return native function declarations grouped by dispatch key and custom namespace.
+# Used in CPUFunctions_inl.h, etc.
+def get_namespaced_declaration(
+ *,
+ grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]],
+ dispatch_key: DispatchKey,
+ backend_idx: BackendIndex,
+ selector: SelectiveBuilder,
+ rocm: bool,
+ symint: bool,
+) -> List[str]:
+ declarations: List[str] = []
+ ns_grouped_kernels: Dict[str, List[str]] = defaultdict(list)
+ newline = "\n"
+ func = dest.RegisterDispatchKey(
+ backend_idx,
+ Target.NAMESPACED_DECLARATION,
+ selector,
+ rocm=rocm,
+ class_method_name=None,
+ skip_dispatcher_op_registration=False,
+ symint=symint,
+ )
+ for f in grouped_native_functions:
+ namespace = get_kernel_namespace(f=f, backend_idx=backend_idx).replace(
+ "native", dispatch_key.lower()
+ )
+
+ ns_grouped_kernels[namespace].extend(
+ func(f),
+ )
+
+ for namespace, kernels in ns_grouped_kernels.items():
+ if len(kernels) == 0:
+ continue
+ ns_helper = NamespaceHelper(
+ namespace_str=namespace, entity_name="", max_level=3
+ )
+ ordered_kernels = list(OrderedDict.fromkeys(kernels))
+ declarations.extend(
+ f"""
+{ns_helper.prologue}
+{newline.join(ordered_kernels)}
+{ns_helper.epilogue}
+ """.split(
+ newline
+ )
+ )
+ return declarations
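+
+# Illustrative result for dispatch_key == DispatchKey.CPU (the kernel name is
+# hypothetical): the `native` portion of the kernel namespace is replaced by
+# the lower-cased dispatch key, so a declaration like
+#   TORCH_API at::Tensor my_op(const at::Tensor & self);
+# ends up under `at::cpu` rather than `at::native`.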
+
+
+# Return native function schema registration code for aten and other namespaces.
+def get_native_function_schema_registrations(
+ *,
+ native_functions: Sequence[NativeFunction],
+ schema_selector: SelectiveBuilder,
+) -> Tuple[List[str], str]:
+ ns_native_functions: Dict[str, List[NativeFunction]] = defaultdict(list)
+ for native_function in native_functions:
+ ns_native_functions[native_function.namespace].append(native_function)
+ schema_registrations = ""
+ aten_schema_registrations = []
+ custom_namespace = None
+ for namespace, funcs in ns_native_functions.items():
+ schema_registrations_body = list(
+ mapMaybe(RegisterSchema(schema_selector), funcs)
+ )
+ # NB: we have to separate the aten namespace registrations from those of
+ # other namespaces, because the template already hardcodes the aten block.
+ if namespace == "aten":
+ aten_schema_registrations = schema_registrations_body
+ else:
+ custom_namespace = namespace
+ tab = "\t"
+ # If the namespace is predefined, define a library fragment
+ # instead of a new library.
+ torch_library_macro = (
+ "TORCH_LIBRARY_FRAGMENT"
+ if namespace in FRAGMENT_NAMESPACES
+ else "TORCH_LIBRARY"
+ )
+ schema_registrations += f"""
+{torch_library_macro}({custom_namespace}, m) {{
+ {tab.join(schema_registrations_body)}
+}};"""
+ return (aten_schema_registrations, schema_registrations)
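+
+# Illustrative sketch of the non-aten part of the returned string (namespace
+# and schema are hypothetical; the macro depends on FRAGMENT_NAMESPACES):
+#   TORCH_LIBRARY(custom, m) {
+#     m.def("my_op(Tensor self) -> Tensor");
+#   };
+# The aten registrations are returned separately because the template already
+# provides the surrounding registration block for the aten namespace.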
+
+
+def gen_aggregated_headers(
+ *,
+ native_functions: Sequence[NativeFunction],
+ grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]],
+ structured_native_functions: Sequence[NativeFunctionsGroup],
+ static_dispatch_idx: List[BackendIndex],
+ selector: SelectiveBuilder,
+ backend_indices: Dict[DispatchKey, BackendIndex],
+ cpu_fm: FileManager,
+ cuda_fm: FileManager,
+ functions_keys: Set[DispatchKey],
+ dispatch_keys: Sequence[DispatchKey],
+ rocm: bool,
+) -> None:
+ # Buck doesn't support dynamic output files, so we aggregate all operator
+ # headers into a single file
+ cpu_fm.write(
+ "NativeMetaFunctions.h",
+ lambda: {
+ "NativeMetaFunctions_includes": [],
+ "NativeMetaFunctions_declarations": list(
+ mapMaybe(compute_meta_function_declaration, structured_native_functions)
+ ),
+ },
+ )
+ method_native_functions = [
+ fn for fn in native_functions if Variant.method in fn.variants
+ ]
+ non_method_native_functions = [
+ fn for fn in native_functions if fn not in method_native_functions
+ ]
+ cpu_fm.write(
+ "MethodOperators.h",
+ lambda: {
+ "MethodOperators_includes": [],
+ "MethodOperators_declarations": list(
+ mapMaybe(
+ ComputeOperators(
+ Target.DECLARATION,
+ static_dispatch_backend_indices=static_dispatch_idx,
+ ),
+ method_native_functions,
+ )
+ ),
+ },
+ )
+ cpu_fm.write(
+ "Operators.h",
+ lambda: {
+ "Operators_includes": ["#include "],
+ "Operators_declarations": list(
+ mapMaybe(
+ ComputeOperators(
+ Target.DECLARATION,
+ static_dispatch_backend_indices=static_dispatch_idx,
+ ),
+ non_method_native_functions,
+ )
+ ),
+ },
+ )
+ cpu_fm.write(
+ "Functions.h",
+ lambda: {
+ "static_dispatch_extra_headers": static_dispatch_extra_headers(
+ static_dispatch_idx
+ ),
+ "Functions_includes": ["#include