0-hero
/

gpt2-pos-encoding-experiment-10B

Model card Files Files and versions Community

0-hero committed on Sep 27, 2024

Commit

9b31431

verified ·

1 Parent(s): afafe68

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.cache/pip/http-v2/8/8/b/8/b/88b8bfb8f1d620e081b2b226e7936019ed96f3fadbfbd878f24e6be7.body +119 -0
.cache/pip/http-v2/f/6/8/9/5/f689581dbe86ea3c1a14226c252116f97e87dde2c835d7d60fd42b59.body +280 -0
.local/share/jupyter/nbextensions/code_font_size/code_font_size.js +70 -0
.local/share/jupyter/nbextensions/code_prettify/README_code_prettify.md +300 -0
.local/share/jupyter/nbextensions/code_prettify/demo_2to3.gif +0 -0
.local/share/jupyter/nbextensions/code_prettify/isort.yaml +40 -0
.local/share/jupyter/nbextensions/codefolding/codefolding.yaml +17 -0
.local/share/jupyter/nbextensions/codefolding/codefolding_editor.yaml +18 -0
.local/share/jupyter/nbextensions/codefolding/codefolding_indent_folded_2.png +0 -0
.local/share/jupyter/nbextensions/codefolding/firstline-fold.js +14 -0
.local/share/jupyter/nbextensions/codefolding/magic-fold.js +14 -0
.local/share/jupyter/nbextensions/codefolding/magic-unfolded.png +0 -0
.local/share/jupyter/nbextensions/codemirror_mode_extensions/main.js +11 -0
.local/share/jupyter/nbextensions/collapsible_headings/main.css +130 -0
.local/share/jupyter/nbextensions/collapsible_headings/screenshot.png +0 -0
.local/share/jupyter/nbextensions/comment-uncomment/main.js +63 -0
.local/share/jupyter/nbextensions/datestamper/readme.md +6 -0
.local/share/jupyter/nbextensions/equation-numbering/button.png +0 -0
.local/share/jupyter/nbextensions/execute_time/execution-timings-menu.png +0 -0
.local/share/jupyter/nbextensions/exercise/exercise.yaml +28 -0
.local/share/jupyter/nbextensions/exercise/main.css +13 -0
.local/share/jupyter/nbextensions/exercise/readme.md +52 -0
.local/share/jupyter/nbextensions/help_panel/help_panel_ext_fullscreen.png +0 -0
.local/share/jupyter/nbextensions/help_panel/readme.md +15 -0
.local/share/jupyter/nbextensions/hide_input/hide-input.yaml +7 -0
.local/share/jupyter/nbextensions/hide_input/main.js +54 -0
.local/share/jupyter/nbextensions/hide_input_all/hide_input_all_hide.png +0 -0
.local/share/jupyter/nbextensions/hide_input_all/icon.png +0 -0
.local/share/jupyter/nbextensions/hide_input_all/main.js +59 -0
.local/share/jupyter/nbextensions/highlight_selected_word/README.md +117 -0
.local/share/jupyter/nbextensions/highlight_selected_word/configurator.yaml +131 -0
.local/share/jupyter/nbextensions/highlighter/demo_highlighter.ipynb +96 -0
.local/share/jupyter/nbextensions/keyboard_shortcut_editor/icon.png +0 -0
.local/share/jupyter/nbextensions/keyboard_shortcut_editor/readme_undefined_key.png +0 -0
.local/share/jupyter/nbextensions/load_tex_macros/main.js +39 -0
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin +0 -0
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx +651 -0
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir +60 -0
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir +53 -0
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.cubin +0 -0
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir +162 -0
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx +338 -0
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx +572 -0
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin +0 -0
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir +243 -0
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir +58 -0
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttgir +49 -0
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttir +48 -0
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir +362 -0
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir +278 -0

.cache/pip/http-v2/8/8/b/8/b/88b8bfb8f1d620e081b2b226e7936019ed96f3fadbfbd878f24e6be7.body ADDED Viewed

	@@ -0,0 +1,119 @@

+Metadata-Version: 2.1
+Name: requests
+Version: 2.32.3
+Summary: Python HTTP for Humans.
+Home-page: https://requests.readthedocs.io
+Author: Kenneth Reitz
+Author-email: [email protected]
+License: Apache-2.0
+Project-URL: Documentation, https://requests.readthedocs.io
+Project-URL: Source, https://github.com/psf/requests
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: charset-normalizer <4,>=2
+Requires-Dist: idna <4,>=2.5
+Requires-Dist: urllib3 <3,>=1.21.1
+Requires-Dist: certifi >=2017.4.17
+Provides-Extra: security
+Provides-Extra: socks
+Requires-Dist: PySocks !=1.5.7,>=1.5.6 ; extra == 'socks'
+Provides-Extra: use_chardet_on_py3
+Requires-Dist: chardet <6,>=3.0.2 ; extra == 'use_chardet_on_py3'
+# Requests
+**Requests** is a simple, yet elegant, HTTP library.
+```python
+>>> import requests
+>>> r = requests.get('https://httpbin.org/basic-auth/user/pass', auth=('user', 'pass'))
+>>> r.status_code
+200
+>>> r.headers['content-type']
+'application/json; charset=utf8'
+>>> r.encoding
+'utf-8'
+>>> r.text
+'{"authenticated": true, ...'
+>>> r.json()
+{'authenticated': True, ...}
+```
+Requests allows you to send HTTP/1.1 requests extremely easily. There’s no need to manually add query strings to your URLs, or to form-encode your `PUT` & `POST` data — but nowadays, just use the `json` method!
+Requests is one of the most downloaded Python packages today, pulling in around `30M downloads / week`— according to GitHub, Requests is currently [depended upon](https://github.com/psf/requests/network/dependents?package_id=UGFja2FnZS01NzA4OTExNg%3D%3D) by `1,000,000+` repositories. You may certainly put your trust in this code.
+[![Downloads](https://static.pepy.tech/badge/requests/month)](https://pepy.tech/project/requests)
+[![Supported Versions](https://img.shields.io/pypi/pyversions/requests.svg)](https://pypi.org/project/requests)
+[![Contributors](https://img.shields.io/github/contributors/psf/requests.svg)](https://github.com/psf/requests/graphs/contributors)
+## Installing Requests and Supported Versions
+Requests is available on PyPI:
+```console
+$ python -m pip install requests
+```
+Requests officially supports Python 3.8+.
+## Supported Features & Best–Practices
+Requests is ready for the demands of building robust and reliable HTTP–speaking applications, for the needs of today.
+- Keep-Alive & Connection Pooling
+- International Domains and URLs
+- Sessions with Cookie Persistence
+- Browser-style TLS/SSL Verification
+- Basic & Digest Authentication
+- Familiar `dict`–like Cookies
+- Automatic Content Decompression and Decoding
+- Multi-part File Uploads
+- SOCKS Proxy Support
+- Connection Timeouts
+- Streaming Downloads
+- Automatic honoring of `.netrc`
+- Chunked HTTP Requests
+## API Reference and User Guide available on [Read the Docs](https://requests.readthedocs.io)
+[![Read the Docs](https://raw.githubusercontent.com/psf/requests/main/ext/ss.png)](https://requests.readthedocs.io)
+## Cloning the repository
+When cloning the Requests repository, you may need to add the `-c
+fetch.fsck.badTimezone=ignore` flag to avoid an error about a bad commit (see
+[this issue](https://github.com/psf/requests/issues/2690) for more background):
+```shell
+git clone -c fetch.fsck.badTimezone=ignore https://github.com/psf/requests.git
+```
+You can also apply this setting to your global Git config:
+```shell
+git config --global fetch.fsck.badTimezone ignore
+```
+---
+[![Kenneth Reitz](https://raw.githubusercontent.com/psf/requests/main/ext/kr.png)](https://kennethreitz.org) [![Python Software Foundation](https://raw.githubusercontent.com/psf/requests/main/ext/psf.png)](https://www.python.org/psf)

.cache/pip/http-v2/f/6/8/9/5/f689581dbe86ea3c1a14226c252116f97e87dde2c835d7d60fd42b59.body ADDED Viewed

	@@ -0,0 +1,280 @@

+Metadata-Version: 2.1
+Name: dill
+Version: 0.3.8
+Summary: serialize all of Python
+Home-page: https://github.com/uqfoundation/dill
+Author: Mike McKerns
+Author-email: [email protected]
+Maintainer: Mike McKerns
+Maintainer-email: [email protected]
+License: BSD-3-Clause
+Download-URL: https://pypi.org/project/dill/#files
+Project-URL: Documentation, http://dill.rtfd.io
+Project-URL: Source Code, https://github.com/uqfoundation/dill
+Project-URL: Bug Tracker, https://github.com/uqfoundation/dill/issues
+Platform: Linux
+Platform: Windows
+Platform: Mac
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Software Development
+Requires-Python: >=3.8
+Provides-Extra: graph
+Requires-Dist: objgraph (>=1.7.2) ; extra == 'graph'
+Provides-Extra: profile
+Requires-Dist: gprof2dot (>=2022.7.29) ; extra == 'profile'
+Provides-Extra: readline
+-----------------------------
+dill: serialize all of Python
+-----------------------------
+About Dill
+==========
+``dill`` extends Python's ``pickle`` module for serializing and de-serializing
+Python objects to the majority of the built-in Python types. Serialization
+is the process of converting an object to a byte stream, and the inverse
+of which is converting a byte stream back to a Python object hierarchy.
+``dill`` provides the user the same interface as the ``pickle`` module, and
+also includes some additional features. In addition to pickling Python
+objects, ``dill`` provides the ability to save the state of an interpreter
+session in a single command.  Hence, it would be feasible to save an
+interpreter session, close the interpreter, ship the pickled file to
+another computer, open a new interpreter, unpickle the session and
+thus continue from the 'saved' state of the original interpreter
+session.
+``dill`` can be used to store Python objects to a file, but the primary
+usage is to send Python objects across the network as a byte stream.
+``dill`` is quite flexible, and allows arbitrary user defined classes
+and functions to be serialized.  Thus ``dill`` is not intended to be
+secure against erroneously or maliciously constructed data. It is
+left to the user to decide whether the data they unpickle is from
+a trustworthy source.
+``dill`` is part of ``pathos``, a Python framework for heterogeneous computing.
+``dill`` is in active development, so any user feedback, bug reports, comments,
+or suggestions are highly appreciated.  A list of issues is located at
+https://github.com/uqfoundation/dill/issues, with a legacy list maintained at
+https://uqfoundation.github.io/project/pathos/query.
+Major Features
+==============
+``dill`` can pickle the following standard types:
+    - none, type, bool, int, float, complex, bytes, str,
+    - tuple, list, dict, file, buffer, builtin,
+    - Python classes, namedtuples, dataclasses, metaclasses,
+    - instances of classes,
+    - set, frozenset, array, functions, exceptions
+``dill`` can also pickle more 'exotic' standard types:
+    - functions with yields, nested functions, lambdas,
+    - cell, method, unboundmethod, module, code, methodwrapper,
+    - methoddescriptor, getsetdescriptor, memberdescriptor, wrapperdescriptor,
+    - dictproxy, slice, notimplemented, ellipsis, quit
+``dill`` cannot yet pickle these standard types:
+    - frame, generator, traceback
+``dill`` also provides the capability to:
+    - save and load Python interpreter sessions
+    - save and extract the source code from functions and classes
+    - interactively diagnose pickling errors
+Current Release
+===============
+The latest released version of ``dill`` is available from:
+    https://pypi.org/project/dill
+``dill`` is distributed under a 3-clause BSD license.
+Development Version
+===================
+You can get the latest development version with all the shiny new features at:
+    https://github.com/uqfoundation
+If you have a new contribution, please submit a pull request.
+Installation
+============
+``dill`` can be installed with ``pip``::
+    $ pip install dill
+To optionally include the ``objgraph`` diagnostic tool in the install::
+    $ pip install dill[graph]
+To optionally include the ``gprof2dot`` diagnostic tool in the install::
+    $ pip install dill[profile]
+For windows users, to optionally install session history tools::
+    $ pip install dill[readline]
+Requirements
+============
+``dill`` requires:
+    - ``python`` (or ``pypy``), **>=3.8**
+    - ``setuptools``, **>=42**
+Optional requirements:
+    - ``objgraph``, **>=1.7.2**
+    - ``gprof2dot``, **>=2022.7.29**
+    - ``pyreadline``, **>=1.7.1** (on windows)
+Basic Usage
+===========
+``dill`` is a drop-in replacement for ``pickle``. Existing code can be
+updated to allow complete pickling using::
+    >>> import dill as pickle
+or::
+    >>> from dill import dumps, loads
+``dumps`` converts the object to a unique byte string, and ``loads`` performs
+the inverse operation::
+    >>> squared = lambda x: x**2
+    >>> loads(dumps(squared))(3)
+    9
+There are a number of options to control serialization which are provided
+as keyword arguments to several ``dill`` functions:
+* with *protocol*, the pickle protocol level can be set. This uses the
+  same value as the ``pickle`` module, *DEFAULT_PROTOCOL*.
+* with *byref=True*, ``dill`` behaves a lot more like pickle, with
+  certain objects (like modules) pickled by reference as opposed to
+  attempting to pickle the object itself.
+* with *recurse=True*, objects referred to in the global dictionary are
+  recursively traced and pickled, instead of the default behavior of
+  attempting to store the entire global dictionary.
+* with *fmode*, the contents of the file can be pickled along with the file
+  handle, which is useful if the object is being sent over the wire to a
+  remote system which does not have the original file on disk. Options are
+  *HANDLE_FMODE* for just the handle, *CONTENTS_FMODE* for the file content
+  and *FILE_FMODE* for content and handle.
+* with *ignore=False*, objects reconstructed with types defined in the
+  top-level script environment use the existing type in the environment
+  rather than a possibly different reconstructed type.
+The default serialization can also be set globally in *dill.settings*.
+Thus, we can modify how ``dill`` handles references to the global dictionary
+locally or globally::
+    >>> import dill.settings
+    >>> dumps(absolute) == dumps(absolute, recurse=True)
+    False
+    >>> dill.settings['recurse'] = True
+    >>> dumps(absolute) == dumps(absolute, recurse=True)
+    True
+``dill`` also includes source code inspection, as an alternate to pickling::
+    >>> import dill.source
+    >>> print(dill.source.getsource(squared))
+    squared = lambda x:x**2
+To aid in debugging pickling issues, use *dill.detect* which provides
+tools like pickle tracing::
+    >>> import dill.detect
+    >>> with dill.detect.trace():
+    >>>     dumps(squared)
+    ┬ F1: <function <lambda> at 0x7fe074f8c280>
+    ├┬ F2: <function _create_function at 0x7fe074c49c10>
+    │└ # F2 [34 B]
+    ├┬ Co: <code object <lambda> at 0x7fe07501eb30, file "<stdin>", line 1>
+    │├┬ F2: <function _create_code at 0x7fe074c49ca0>
+    ││└ # F2 [19 B]
+    │└ # Co [87 B]
+    ├┬ D1: <dict object at 0x7fe0750d4680>
+    │└ # D1 [22 B]
+    ├┬ D2: <dict object at 0x7fe074c5a1c0>
+    │└ # D2 [2 B]
+    ├┬ D2: <dict object at 0x7fe074f903c0>
+    │├┬ D2: <dict object at 0x7fe074f8ebc0>
+    ││└ # D2 [2 B]
+    │└ # D2 [23 B]
+    └ # F1 [180 B]
+With trace, we see how ``dill`` stored the lambda (``F1``) by first storing
+``_create_function``, the underlying code object (``Co``) and ``_create_code``
+(which is used to handle code objects), then we handle the reference to
+the global dict (``D2``) plus other dictionaries (``D1`` and ``D2``) that
+save the lambda object's state. A ``#`` marks when the object is actually stored.
+More Information
+================
+Probably the best way to get started is to look at the documentation at
+http://dill.rtfd.io. Also see ``dill.tests`` for a set of scripts that
+demonstrate how ``dill`` can serialize different Python objects. You can
+run the test suite with ``python -m dill.tests``. The contents of any
+pickle file can be examined with ``undill``.  As ``dill`` conforms to
+the ``pickle`` interface, the examples and documentation found at
+http://docs.python.org/library/pickle.html also apply to ``dill``
+if one will ``import dill as pickle``. The source code is also generally
+well documented, so further questions may be resolved by inspecting the
+code itself. Please feel free to submit a ticket on github, or ask a
+question on stackoverflow (**@Mike McKerns**).
+If you would like to share how you use ``dill`` in your work, please send
+an email (to **mmckerns at uqfoundation dot org**).
+Citation
+========
+If you use ``dill`` to do research that leads to publication, we ask that you
+acknowledge use of ``dill`` by citing the following in your publication::
+    M.M. McKerns, L. Strand, T. Sullivan, A. Fang, M.A.G. Aivazis,
+    "Building a framework for predictive science", Proceedings of
+    the 10th Python in Science Conference, 2011;
+    http://arxiv.org/pdf/1202.1056
+    Michael McKerns and Michael Aivazis,
+    "pathos: a framework for heterogeneous computing", 2010- ;
+    https://uqfoundation.github.io/project/pathos
+Please see https://uqfoundation.github.io/project/pathos or
+http://arxiv.org/pdf/1202.1056 for further information.

.local/share/jupyter/nbextensions/code_font_size/code_font_size.js ADDED Viewed

	@@ -0,0 +1,70 @@

+// Increase/decrease code font size
+define([
+    'base/js/namespace',
+    'base/js/events'
+    ], function(Jupyter, events) {
+            var code_change_fontsize =  function(doIncrease) {
+                var pre_css = null;
+                var pre_style = null;
+                for(i = 0; i < document.styleSheets.length; i++){
+                    //if style sheet is custom.css
+                    if(/.*\/custom\/custom\.css/.test(document.styleSheets[i].href)){
+                        //pre_css now contains the style sheet custom.css
+                        pre_css = document.styleSheets[i];
+                        break;
+                    }
+                }
+                for(i = 0; i < pre_css.cssRules.length; i++){
+                    if(/\.CodeMirror pre/.test(pre_css.cssRules[i].selectorText)){
+                        pre_style = pre_css.cssRules[i].style;
+                        break;
+                    }
+                }
+                if(pre_style == null){
+                    pre_css.insertRule(".CodeMirror pre { font-size: \"14px\"; padding-bottom: \"0px\"; }", 0);
+                    pre_style = pre_css.cssRules[0];
+                }
+                var font_size = pre_style.fontSize || "";
+                if(font_size == "")
+                    font_size = 14;
+                else
+                    font_size = +/\d+/.exec(font_size)[0];
+                font_size += (doIncrease ? +3 : -3);
+                font_size = (font_size < 8 ? 8 : font_size);
+                var padding_size = (font_size <= 14 ? 0 : (font_size - 14));
+                pre_style.paddingBottom = padding_size + "px";
+                pre_style.fontSize = font_size + "px";
+            };
+        var load_ipython_extension = function () {
+            Jupyter.toolbar.add_buttons_group([
+                /*
+                 * Buttons to increase/decrease code font size
+                 */
+                Jupyter.keyboard_manager.actions.register ({
+                     'help'   : 'Increase code font size',
+                     'icon'   : 'fa-search-plus',
+                     'handler': function () {
+                        $( document ).ready(code_change_fontsize(true));
+                     }
+                }, 'increase-code-font-size', 'code_font_size'),
+                Jupyter.keyboard_manager.actions.register ({
+                     'help'   : 'Decrease code font size',
+                     'icon'   : 'fa-search-minus',
+                     'handler': function () {
+                        $( document ).ready(code_change_fontsize(false));
+                     }
+                }, 'decrease-code-font-size', 'code_font_size'),
+            ]);
+        };
+        return {
+            load_ipython_extension : load_ipython_extension
+        };
+});

.local/share/jupyter/nbextensions/code_prettify/README_code_prettify.md ADDED Viewed

	@@ -0,0 +1,300 @@

+A Code Prettifier
+=================
+This nbextension reformats/prettifies code in notebook code cells.
+Under the hood, it uses a call to the current notebook kernel to reformat the
+code.
+Thus the actual prettifier package has to be callable from the current kernel
+language.
+With an appropriately-configured prettifier for the kernel in use, the
+nbextension provides
+- a toolbar button (configurable to be added or not)
+- a keyboard shortcut for reformatting the current code-cell (default shortcut
+  is `Ctrl-L`, can also be configured not to add the keyboard shortcut).
+- a keyboard shortcut for reformatting the whole notebook (default shortcut
+  is `Ctrl-Shift-L`, can also be configured not to add the keyboard shortcut).
+The code to be reformatted must be syntactically correct; the nbextension may also point out basic syntax errors.
+![](demo-py.gif)
+![](demo-R.gif)
+![](demo-jv.gif)
+Compatible Kernels
+------------------
+Example implementations are provided for prettifiers for ipython, ir and
+ijavascript kernels which should work out of the box (assuming availability of
+the relevant kernel-specific [prerequisites] mentioned below), but the
+kernel-specific prettifier calls are configurable, so the model is applicable
+to essentially any kernel language and prettifier library.
+Other languages may be added as defaults in the future, but given that there
+are more than 50 [kernels] available for Jupyter, it is not easily possible to
+support all of them out of the box, unless people with experience in the
+relevant kernels have the time to contribute code. For information on how the
+reformatting takes place, and how to adapt it for your particular
+kernel/prettifier, see the [options] and [internals] sections below.
+If you implement a language that isn't yet provided by default, please submit a
+PR or let us know to add it to the repo :)
+Under the hood, this nbextension's functionality is provided by the
+[KerneExecOnCells library], a shared library for creating Jupyter nbextensions
+which transform code cell text using calls to the active kernel.
+Prerequisites
+-------------
+Of course, you must have the necessary kernel-specific packages installed for
+the prettifier call to work:
+- for the default python implementation, the [yapf] module is required:
+        pip install yapf
+  Others you might consider using include [autopep8] - see [README_autopep8.md].
+- for R, the default implementation uses the [formatR] and [jsonlite] packages:
+  ```r
+  install.packages(c("formatR", "jsonlite"), repos="http://cran.rstudio.com")
+  ```
+- for [ijavascript], the [js-beautify] package is used:
+  (*Under linux, in the root of your user tree = ~*)
+      npm install js-beautify
+  Under Windows, you may then need to set the `NODE_PATH` environment variable
+  to `%AppData%\npm\node_modules` (Windows 7/8/10); see
+  [this question on stackoverflow].
+  To be done with it once and for all, add this as a System variable in the
+  Advanced tab of the System Properties dialog.
+Options
+-------
+All options are provided by the
+[KernelExecOnCells library][KerneExecOnCells library] - see the
+[internals] section below for details.
+There are a few nbextension-wide options, configurable using the
+[jupyter_nbextensions_configurator] or by editing the `notebook` section config
+file directly.
+The options are as follows:
+- `code_prettify.add_toolbar_button`:
+  Whether to add a toolbar button to transform the selected cell(s).
+  Defaults to `true`.
+- `code_prettify.button_icon`:
+  A font-awesome class defining the icon used for the toolbar button and
+  actions. See [fontawesome] for available icon classes.
+  Defaults to `fa-legal`.
+- `code_prettify.button_label`:
+  Toolbar button label text. Also used in the actions' help text.
+  Defaults to `Code prettify`.
+- `code_prettify.register_hotkey`:
+  Whether to register hotkeys to transform the selected cell(s)/whole notebook.
+  Defaults to `true`.
+- `code_prettify.hotkeys.process_all`:
+  Hotkey to use to transform all the code cells in the notebook.
+  Defaults to `Ctrl-Shift-L`.
+- `code_prettify.hotkeys.process_selected`:
+  Hotkey to use to transform the selected cell(s).
+  Defaults to `Ctrl-L`.
+- `code_prettify.show_alerts_for_not_supported_kernel`:
+  Whether to show alerts if the kernel is not supported.
+  Defaults to `false`.
+- `code_prettify.show_alerts_for_errors`:
+  Whether to show alerts for errors in the kernel calls.
+  Defaults to `true`.
+- `code_prettify.kernel_config_map_json`:
+  The value of this key is a string which can be parsed into a json object
+  giving the config for each kernel language.
+  The following give the per-kernel options of the parsed json, using the
+  language key `python`:
+  * `code_prettify.kernel_config_map_json.python.library`:
+    String to execute in the kernel in order to load any necessary kernel
+    libraries.
+  * `code_prettify.kernel_config_map_json.python.replacements_json_to_kernel`:
+    a list of pairs of strings, used as arguments to javascript's
+    `String.replace(from, to)` to translate from a json string into a valid
+    representation of the same string in the kernel language. Since json
+    strings are particularly simple, this can often (as with the python
+    language) be left as the default, an empty list.
+  * `code_prettify.kernel_config_map_json.python.prefix` and
+    `code_prettify.kernel_config_map_json.python.postfix`:
+    Strings added as bookends to the kernel string (translated from the json
+    string using the replacements above) to make up the call to the
+    kernel's prettifier libraries.
+  * `code_prettify.kernel_config_map_json.python.trim_formatted_text`:
+    Whether to trim whitespace from the transformed cell text. Since jupyter
+    cells don't usually have leading or trailing whitespace, the default
+    behaviour is to trim the transformed text, in order to prevent the
+    transform adding extra newlines at the end (a common behaviour for source
+    files, where having a trailing newline is often considered good practice).
+Internals
+---------
+Under the hood, this nbextension uses the [KerneExecOnCells library], a shared
+library for creating Jupyter nbextensions which transform code cell text using
+calls to the active kernel.
+The model is essentially:
+1.  The cell text is grabbed by client-side javascript, then turned into a json
+    string using javascript `JSON.stringify`. Json is used because
+    json-compatible strings are a particularly simple string format, which is
+    compatible with many other programming languages without much modification
+    (e.g. a valid json string is also a valid string in python 3, and also in
+    python 2 when prefixed with a `u`), and easily converted for use in others.
+2.  Optional regex replacements are used to translate the json-format string
+    into a valid kernel string. Python, R and javascript don't require this
+    step, but other  languages may do, so it's implemented for flexibility
+    using the per-kernel config key `replacements_json_to_kernel`, which is a
+    list of pairs of arguments to javascript `String.replace`.
+3.  The kernel-specific prettifier call is then composed from
+    `kernel_config.prefix` + `kernel_text_string` + `kernel_config.postfix` and
+    sent to the kernel for execution. This kernel call is expected to get the
+    formatted cell text _printed_ as a json-compatible string. Since most
+    kernel languages have json packages, this should hopefully be easy to
+    arrange. The reason for printing the text rather than simply displaying it
+    is that it prevents us having to translate from a kernel string
+    representing a json string.
+4.  The callback for the kernel execution in client-side javascript parses the
+    printed json-format string, optionally trims trailing whitespace according
+    to the `trim_formatted_text` key (which defaults to `true`) in the
+    per-kernel config, and then sets the cell text using the result.
+The process is probably best illustrated using an example for the python
+implementation:
+1.  **At nbextension load**, the `code_prettify.kernel_config_map_json` config
+    option is parsed to give the json object
+    ```json
+    {
+      "python": {
+        "library": "import json\nimport yapf.yapflib.yapf_api",
+        "prefix": "print(json.dumps(yapf.yapflib.yapf_api.FormatCode(u",
+        "postfix": ")[0]))"
+      }
+    }
+    ```
+    (other kernel languages are omitted for clarity).
+2.  **On kernel becoming ready**, the nbextension looks up the config for the
+    kernel's language (in our example, this is the `python` key of the kernel
+    config json object above). It then sends the kernel config's `library`
+    string to the kernel for execution. Thus the python implementation above
+    executes
+    ```python
+    import json
+    import yapf.yapflib.yapf_api
+    ```
+3.  **On requesting a cell be prettified** which can happen by clicking the
+    toolbar, or with a (configurable) hotkey, the following happens:
+    Say the cell to be formatted contains the following ugly python code:
+    ```python
+    msg= 'hello '+"world"
+    print  (
+                        msg    )
+    ```
+    Then the result of the `JSON.stringify` call will be a string containing
+    ```json
+    "msg= 'hello '+\"world\"\nprint  (\n                    msg    )"
+    ```
+    (note the opening and closing quotes). Concatenating this with the prefix &
+    postfix strings from the python kernel config above, gives us the kernel
+    code to execute. The call sent to the python kernel is therefore
+    ```python
+    print(json.dumps(yapf.yapflib.yapf_api.FormatCode(u"msg= 'hello '+\"world\"\nprint  (\n                    msg    )")[0]))
+    ```
+4.  What gets 'printed' by the kernel (i.e. returned to the javascript stream
+    callback) is the following json-format string:
+    ```json
+    "msg = 'hello ' + \"world\"\nprint(msg)\n"
+    ```
+    The default is to trim whitespace from the returned prettified text, which
+    results in the final prettified python code for the cell:
+    ```python
+    msg = 'hello ' + "world"
+    print(msg)
+    ```
+History
+-------
+- [@jfbercher], august 14, 2016, first version, named `yapf_ext`
+- [@jfbercher], august 19, 2016, second version `code_prettify`
+  - introduced support for R and javascript.
+  - changed extension name from `yapf_ext` to `code_prettify`
+- [@jcb91], december 2016
+  - made addition of toolbar button & hotkey configurable
+  - reworked to avoid regex replacements for conversion to/from kernel string
+    formats, in favour of json-string interchange
+  - made kernel-specific prettifier calls configurable, allowing support for
+    different prettifiers & arbitrary kernels
+  - improved documentation
+- [@jfbercher], december 2016-january 2017
+  - added a configurable shortkey to reflow the whole notebook
+  - extracted most of the code to build a general library of functions,
+    `kernel_exec_on_cell.js`, which can be used by any nbextension which
+    needs to execute some code (via the current kernel) on the text from cells.
+[@jcb91]: https://github.com/jcb91
+[@jfbercher]: https://github.com/jfbercher
+[autopep8]: https://github.com/hhatto/autopep8
+[formatR]: https://yihui.name/formatr
+[fontawesome]: https://fontawesome.com/icons
+[ijavascript]: https://n-riesco.github.io/ijavascript
+[internals]: #Internals
+[js-beautify]: https://github.com/beautify-web/js-beautify
+[jsonlite]: https://github.com/jeroen/jsonlite
+[jupyter_nbextensions_configurator]: https://github.com/Jupyter-contrib/jupyter_nbextensions_configurator
+[KerneExecOnCells library]: README.md
+[kernels]: https://github.com/ipython/ipython/wiki/IPython-kernels-for-other-languages
+[options]: #Options
+[prerequisites]: #Prerequisites
+[README_autopep8.md]: README_autopep8.md
+[this question on stackoverflow]: https://stackoverflow.com/questions/9587665/nodejs-cannot-find-installed-module-on-windows
+[yapf]: https://github.com/google/yapf

.local/share/jupyter/nbextensions/code_prettify/demo_2to3.gif ADDED Viewed

.local/share/jupyter/nbextensions/code_prettify/isort.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+Type: Jupyter Notebook Extension
+Name: isort formatter
+Description: Sort imports in python files using isort
+Link: README_isort.md
+Main: isort.js
+Compatibility: Jupyter 4.x, 5.x
+Parameters:
+- name: isort.add_toolbar_button
+  description: Add a toolbar button to convert the selected cell(s)
+  input_type: checkbox
+  default: true
+- name: isort.button_icon
+  description: |
+    Toolbar button icon: a font-awesome class defining the icon used for the
+    toolbar button. See https://fontawesome.com/icons for available icons.
+  input_type: text
+  default: 'fa-sort'
+- name: isort.button_label
+  description: Toolbar button label text
+  input_type: text
+  default: 'Sort imports with isort'
+- name: isort.kernel_config_map_json
+  description: |
+    kernel_config_map_json:
+    json defining library calls required to load the kernel-specific
+    converting modules, and the prefix & postfix for the json-format string
+    required to make the converting call.
+  input_type: textarea
+  default: |
+    {
+      "python": {
+        "library": "import json, isort\ndef _isort_refactor_cell(src):\n    try:\n        tree = isort.SortImports(file_contents=src).output\n    except Exception:\n        return src \n    else:\n        return str(tree)[:-1]",
+        "prefix": "print(json.dumps(_isort_refactor_cell(u",
+        "postfix": ")))"
+      }
+    }

.local/share/jupyter/nbextensions/codefolding/codefolding.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+Type: Jupyter Notebook Extension
+Name: Codefolding
+Description: This extension enables the CodeMirror feature to allow codefolding in code cells
+Link: readme.md
+Icon: icon.png
+Main: main.js
+Compatibility: 4.x, 5.x
+Parameters:
+- name: codefolding_hotkey
+  description: Hotkey to fold/unfold code
+  input_type: hotkey
+  default: Alt-F
+- name: init_delay
+  description: Add a delay before initializing the extension. Useful when the gutter is not being initialized correctly.
+  input_type: number
+  min: 0
+  default: 0

.local/share/jupyter/nbextensions/codefolding/codefolding_editor.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+Type: Jupyter Notebook Extension
+Name: Codefolding in Editor
+Description: |
+  Enables the CodeMirror feature to allow codefolding in the Jupyter file
+  editor view.
+  Note that this also uses the codefolding hotkey from the codefolding
+  nbextension for the notebook view.
+Link: readme.md
+Icon: codefolding_editor.png
+Main: edit.js
+Compatibility: 4.x, 5.x
+Parameters:
+- name: init_delay
+  description: Add a delay before initializing the extension. Useful when the gutter is not being initialized correctly.
+  input_type: number
+  min: 0
+  default: 1000
+Section: edit

.local/share/jupyter/nbextensions/codefolding/codefolding_indent_folded_2.png ADDED Viewed

.local/share/jupyter/nbextensions/codefolding/firstline-fold.js ADDED Viewed

	@@ -0,0 +1,14 @@

+/* allow folding of complete cell, if comment is in first line */
+CodeMirror.registerHelper("fold", "firstline", function(cm, start) {
+  var mode = cm.getMode(), Token = mode.lineComment;
+  if (start.line == 0) {
+      var lineText = cm.getLine(start.line);
+      var found = lineText.lastIndexOf(Token,0);
+      if (found == 0) {
+        end =  cm.lastLine();
+        return {from: CodeMirror.Pos(start.line, null),
+              to: CodeMirror.Pos(end, null)};
+        }
+    }
+    return ;
+});

.local/share/jupyter/nbextensions/codefolding/magic-fold.js ADDED Viewed

	@@ -0,0 +1,14 @@

+/* allow folding of complete cell, if IPython magic symbol "%" is in first line */
+CodeMirror.registerHelper("fold", "magic", function(cm, start) {
+  var mode = cm.getMode(), Token = "%%";
+  if (start.line == 0) {
+      var lineText = cm.getLine(start.line);
+      var found = lineText.lastIndexOf(Token,0);
+      if (found == 0) {
+        end =  cm.lastLine();
+        return {from: CodeMirror.Pos(start.line, null),
+              to: CodeMirror.Pos(end, null)};
+        }
+    }
+    return ;
+});

.local/share/jupyter/nbextensions/codefolding/magic-unfolded.png ADDED Viewed

.local/share/jupyter/nbextensions/codemirror_mode_extensions/main.js ADDED Viewed

	@@ -0,0 +1,11 @@

+define(['codemirror/lib/codemirror'], function (CodeMirror) {
+    "use strict";
+    return {
+        load_ipython_extension : function () {
+            CodeMirror.extendMode('octave', {
+                lineComment: '%',
+                fold: 'indent',
+            });
+        }
+    };
+});

.local/share/jupyter/nbextensions/collapsible_headings/main.css ADDED Viewed

	@@ -0,0 +1,130 @@

+.collapsible_headings_toggle .h1 {
+  font-size: 185.7%;
+  margin: 0.538em 0 0 0;
+  line-height: 1.0;
+}
+.collapsible_headings_toggle .h2 {
+  font-size: 157.1%;
+  margin: 0.636em 0 0 0;
+  line-height: 1.0;
+}
+.collapsible_headings_toggle .h3 {
+  font-size: 128.6%;
+  margin: 0.777em 0 0 0;
+  line-height: 1.0;
+}
+.collapsible_headings_toggle .h4,
+.collapsible_headings_toggle .h5,
+.collapsible_headings_toggle .h6 {
+  font-size: 100%;
+  margin: 1em 0 0 0;
+  line-height: 1.0;
+}
+.collapsible_headings_toggle.btn .h1,
+.collapsible_headings_toggle.btn .h2,
+.collapsible_headings_toggle.btn .h3,
+.collapsible_headings_toggle.btn .h4,
+.collapsible_headings_toggle.btn .h5,
+.collapsible_headings_toggle.btn .h6 {
+  margin-top: 0;
+}
+.collapsible_headings_toggle .fa {
+	transition: transform 400ms;
+  /* don't support IE filter, since can't rotate 360 */
+	-webkit-transform: rotate(360deg);
+	-moz-transform: rotate(360deg);
+	-ms-transform: rotate(360deg);
+	-o-transform: rotate(360deg);
+	transform: rotate(360deg);
+}
+.collapsible_headings_collapsed .fa {
+	-webkit-transform: none;
+	-moz-transform: none;
+	-ms-transform: none;
+	-o-transform: none;
+	transform: none;
+}
+/* bracket rules */
+div.cell {
+  position: relative; /* positioning context for absolutely-positioned descendants */
+}
+.chb {
+  position: absolute;
+  top: -1px; /* extends 1px past the top and bottom of its containing block */
+  bottom: -1px;
+  left: calc(100% + 3px); /* starts 3px beyond the containing block's right edge */
+  display: flex;
+  flex-direction: row-reverse;
+  justify-content: flex-start;
+  align-items: stretch; /* child divs take the full height of .chb */
+}
+.chb div {
+  margin-left: 2px;
+  width: 5px;
+  border-color: #aaa;
+  border-left-color: transparent; /* only the right-hand border is visibly drawn */
+  border-style: solid;
+  border-width: 0 2px 0 2px; /* no top/bottom border by default */
+}
+.collapsible_headings_collapsed .chb .chb-start {
+  border-width: 5px 2px 2px 4px;
+}
+.chb div:hover,
+.chb .chb-hover,
+.jupyter-soft-selected .chb div{
+    border-color:  #42A5F5;
+    border-left-color: transparent;
+	  border-width: 0 3px 0 0;
+}
+.chb .chb-start {
+  border-top-width: 1px;
+  margin-top: 2px;
+}
+.chb .chb-end {
+  border-bottom-width: 1px;
+  margin-bottom: 2px;
+}
+.chb-start div:hover, .chb .chb-start.chb-hover, .jupyter-soft-selected .chb .chb-start {
+  border-top-width: 2px;
+}
+.chb-end div:hover, .chb .chb-end.chb-hover, .jupyter-soft-selected .chb .chb-end {
+  border-bottom-width: 2px;
+}
+/* ellipsis rules */
+.collapsible_headings_ellipsis .rendered_html h1,
+.collapsible_headings_ellipsis .rendered_html h2,
+.collapsible_headings_ellipsis .rendered_html h3,
+.collapsible_headings_ellipsis .rendered_html h4,
+.collapsible_headings_ellipsis .rendered_html h5,
+.collapsible_headings_ellipsis .rendered_html h6 {
+  position: relative;
+  padding-right: 2em;
+}
+.collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h1:after,
+.collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h2:after,
+.collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h3:after,
+.collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h4:after,
+.collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h5:after,
+.collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h6:after {
+  position: absolute;
+  right: 0;
+  bottom: 0;
+  content: "[\002026]";
+  color: #aaa;
+}

.local/share/jupyter/nbextensions/collapsible_headings/screenshot.png ADDED Viewed

.local/share/jupyter/nbextensions/comment-uncomment/main.js ADDED Viewed

	@@ -0,0 +1,63 @@

+// add new configurable hotkey binding to toggle comments
+define([
+    'base/js/namespace',
+], function(
+    IPython
+) {
+    "use strict";
+    // define default config parameter values
+    var params = {
+        comment_uncomment_keybinding : 'alt-c',
+        comment_uncomment_indent: false,
+    };
+    // updates default params with any specified in the server's config
+    var update_params = function() {
+        var config = IPython.notebook.config;
+        for (var key in params){
+            if (config.data.hasOwnProperty(key) ){
+                params[key] = config.data[key];
+            }
+        }
+    };
+    var initialize = function () {
+        // update defaults
+        update_params();
+        // register actions with ActionHandler instance
+        var prefix = 'auto';
+        var name = 'toggle-comment';
+        var action = {
+            icon: 'fa-comment-o',
+            help    : 'Toggle comments',
+            help_index : 'eb',
+            id : 'read_only_codecell',
+            handler : toggle_comment
+        };
+        var action_full_name = IPython.keyboard_manager.actions.register(action, name, prefix);
+        // define keyboard shortcuts
+        var edit_mode_shortcuts = {};
+        edit_mode_shortcuts[params.comment_uncomment_keybinding] = action_full_name;
+        // register keyboard shortcuts with keyboard_manager
+        IPython.notebook.keyboard_manager.edit_shortcuts.add_shortcuts(edit_mode_shortcuts);
+    };
+    var toggle_comment = function() {
+        var cm = IPython.notebook.get_selected_cell().code_mirror;
+        cm.toggleComment({ indent: params.comment_uncomment_indent });
+        return false;
+    };
+    var load_ipython_extension = function () {
+        return IPython.notebook.config.loaded.then(initialize);
+    };
+    return {
+        load_ipython_extension : load_ipython_extension
+    };
+});

.local/share/jupyter/nbextensions/datestamper/readme.md ADDED Viewed

	@@ -0,0 +1,6 @@

+Datestamper
+===========
+Adds a toolbar button which pastes the current time & date into the current cell:
+![](icon.png)

.local/share/jupyter/nbextensions/equation-numbering/button.png ADDED Viewed

.local/share/jupyter/nbextensions/execute_time/execution-timings-menu.png ADDED Viewed

.local/share/jupyter/nbextensions/exercise/exercise.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+Type: Jupyter Notebook Extension
+Name: Exercise
+Description: |
+  Define a group of cells as an "exercise".
+  The first cell is the question,
+  while the rest of the group form the answer or solution.
+  The solution can be hidden/shown by clicking on a widget added to the
+  question cell.
+Link: readme.md
+Icon: icon.png
+Main: main.js
+Compatibility: 4.x, 5.x
+Parameters:
+- name: add_button
+  description: Add a toolbar button to create/remove an exercise
+  input_type: checkbox
+  default: true
+- name: use_hotkey
+  description: Add a keyboard shortcut to create/remove an exercise
+  input_type: checkbox
+  default: true
+- name: hotkey
+  description: Keyboard shortcut optionally used to create/remove an exercise
+  input_type: hotkey
+  default: 'Alt-D'

.local/share/jupyter/nbextensions/exercise/main.css ADDED Viewed

	@@ -0,0 +1,13 @@

+.highlight-mask
+{
+    background: transparent url('../images/theme/transBlack75.png') repeat 0 0;
+    display: none;
+    position: absolute;
+}
+.highlight-drag
+{
+    background-color: transparent;
+    border: dashed #ff3333 3px;
+    position: absolute;
+    display: none;
+}

.local/share/jupyter/nbextensions/exercise/readme.md ADDED Viewed

	@@ -0,0 +1,52 @@

+Exercise
+========
+These are two extensions for Jupyter, for hiding/showing solutions cells.
+They use the same approach and codebase and differ only by the type of
+`cell widget` used to show/hide the solutions. The two extensions can be used
+simultaneously. They require the `rubberband` extension to be installed and
+enabled.
+The example below demonstrates some of the features of the exercise extensions.
+- First, a solution or "details" cell is created by (a) selecting two cells with the rubberband and (b) clicking on the menu-button [exercise extension]
+- Second, the two next cells are selected using a keyboard shortcut, and a solution is created using the shortcut Alt-D [exercise2 extension]
+- Third, the two solutions are expanded by clicking on the corresponding widgets
+- Fourth, the solutions are removed by selecting them and clicking on the buttons in the toolbar.
+![](image.gif)
+The extensions provide
+----------------------
+- a menubar button
+- a cell widget -- A plus/minus button in `exercise` and a sliding checkbox in `exercise2`.
+The menubar button is devoted to the creation or removal of the solution. The solution consists of several consecutive cells that can be selected by the usual notebook multicell selection methods (e.g. *Shift-down* (select next) or *Shift-up* (select previous) keyboard shortcuts, or using the rubberband extension).
+### Creating a solution
+With several cells selected, pressing the menubar button adds a `cell widget` and hides all the cells except the first one, which serves as a heading cell. *Do not forget to keep the Shift key pressed down while clicking on the menu button
+(otherwise selected cells will be lost)*. It is also possible to use a keyboard shortcut for creating the solution from selected cells: Alt-S for exercise extension and Alt-D for exercise2.
+### Removing a solution
+If a solution heading (first) cell is selected, then clicking the menu bar button removes this solution and its solution cells are shown. Using the keyboard shortcut has the same effect.
+### Showing/hiding solution
+At creation of the solution, the solution cells are hidden. Clicking the `cell widget` toggles the hidden/shown state of the solution.
+### Persistence
+The state of solutions, hidden or shown, is preserved and automatically restored at startup and on reload.
+### Internals
+exercise and exercise2 add, respectively, a solution and a solution2 metadata entry to solution cells, whose value is the current hidden/shown state of the solution. For exercise, a div with the plus/minus character is prepended to the solution heading cell. For exercise2, a flex-wrap style is added to the solution heading cell and a checkbox widget, with some css styling, is appended to the cell. A solution[.2]_first metadata entry is also added to enable easy detection of the first cell in an "exercise", which in turn allows several consecutive exercises.

.local/share/jupyter/nbextensions/help_panel/help_panel_ext_fullscreen.png ADDED Viewed

.local/share/jupyter/nbextensions/help_panel/readme.md ADDED Viewed

	@@ -0,0 +1,15 @@

+Help Panel
+===========
+Installing the extension adds a new button to the toolbar:
+![](icon.png)
+On clicking the button, the notebook width is reduced and a side panel is displayed showing help.
+The contents of the help panel are exactly the same as when going to `Keyboard Shortcuts` in the `Help` menu.
+![](help_panel_ext.png)
+You can drag the sidebar divider to resize it, or click the expand icon at the top left of the bar to get the help panel to expand to fill the screen:
+![](help_panel_ext_fullscreen.png)

.local/share/jupyter/nbextensions/hide_input/hide-input.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+Type: IPython Notebook Extension
+Compatibility: 3.x, 4.x, 5.x
+Main: main.js
+Name: Hide input
+Icon: icon.png
+Description: "toggle display of selected code cell's input"
+Link: readme.md

.local/share/jupyter/nbextensions/hide_input/main.js ADDED Viewed

	@@ -0,0 +1,54 @@

+// Adds a button to hide the input part of the currently selected cells
+define([
+    'jquery',
+    'base/js/namespace',
+    'base/js/events'
+], function(
+    $,
+    Jupyter,
+    events
+) {
+    "use strict";
+    var toggle_selected_input = function () {
+        // Find the selected cell
+        var cell = Jupyter.notebook.get_selected_cell();
+        // Toggle visibility of the input div
+        cell.element.find("div.input").toggle('slow');
+        cell.metadata.hide_input = ! cell.metadata.hide_input;
+    };
+    var update_input_visibility = function () {
+        Jupyter.notebook.get_cells().forEach(function(cell) {
+            if (cell.metadata.hide_input) {
+                cell.element.find("div.input").hide();
+            }
+        })
+    };
+    var load_ipython_extension = function() {
+        // Add a button to the toolbar
+        $(Jupyter.toolbar.add_buttons_group([
+            Jupyter.keyboard_manager.actions.register({
+                help   : 'Toggle selected cell input display',
+                icon   : 'fa-chevron-up',
+                handler: function() {
+                    toggle_selected_input();
+                    setTimeout(function() { $('#btn-hide-input').blur(); }, 500);
+                }
+            }, 'toggle-cell-input-display', 'hide_input')
+        ])).find('.btn').attr('id', 'btn-hide-input');
+        // Collapse all cells that are marked as hidden
+        if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
+            // notebook already loaded. Update directly
+            update_input_visibility();
+        }
+        events.on("notebook_loaded.Notebook", update_input_visibility);
+    };
+    return {
+        load_ipython_extension : load_ipython_extension
+    };
+});

.local/share/jupyter/nbextensions/hide_input_all/hide_input_all_hide.png ADDED Viewed

.local/share/jupyter/nbextensions/hide_input_all/icon.png ADDED Viewed

.local/share/jupyter/nbextensions/hide_input_all/main.js ADDED Viewed

	@@ -0,0 +1,59 @@

+// toggle display of all code cells' inputs
+define([
+    'jquery',
+    'base/js/namespace',
+    'base/js/events'
+], function(
+    $,
+    Jupyter,
+    events
+) {
+    "use strict";
+    function set_input_visible(show) {
+        Jupyter.notebook.metadata.hide_input = !show;
+        if (show) $('div.input').show('slow');
+        else $('div.input').hide('slow');
+        var btn = $('#toggle_codecells');
+        btn.toggleClass('active', !show);
+        var icon = btn.find('i');
+        icon.toggleClass('fa-eye', show);
+        icon.toggleClass('fa-eye-slash', !show);
+        $('#toggle_codecells').attr(
+            'title', (show ? 'Hide' : 'Show') + ' codecell inputs');
+    }
+    function toggle() {
+        set_input_visible($('#toggle_codecells').hasClass('active'));
+    }
+    function initialize () {
+        set_input_visible(Jupyter.notebook.metadata.hide_input !== true);
+    }
+    var load_ipython_extension = function() {
+        $(Jupyter.toolbar.add_buttons_group([
+            Jupyter.keyboard_manager.actions.register({
+                help   : 'Hide codecell inputs',
+                icon   : 'fa-eye',
+                handler: function() {
+                    toggle();
+                    setTimeout(function() { $('#toggle_codecells').blur(); }, 500);
+                }
+            }, 'hide-codecell-inputs', 'hide_input_all'),
+        ])).find('.btn').attr('id', 'toggle_codecells');
+        if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
+            // notebook_loaded.Notebook event has already happened
+            initialize();
+        }
+        events.on('notebook_loaded.Notebook', initialize);
+    };
+    return {
+        load_ipython_extension : load_ipython_extension
+    };
+});

.local/share/jupyter/nbextensions/highlight_selected_word/README.md ADDED Viewed

	@@ -0,0 +1,117 @@

+Highlight selected word
+=======================
+[![Join the chat at https://gitter.im/jcb91/jupyter_highlight_selected_word](https://badges.gitter.im/jcb91/jupyter_highlight_selected_word.svg)](https://gitter.im/jcb91/jupyter_highlight_selected_word?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![GitHub issues](https://img.shields.io/github/issues/jcb91/jupyter_highlight_selected_word.svg?maxAge=3600)](https://github.com/jcb91/jupyter_highlight_selected_word/issues)
+This nbextension highlights all instances of the selected word in either the
+current cell's editor, or in all cells in the notebook.
+It is based on the CodeMirror addon
+[Match Highlighter](https://codemirror.net/demo/matchhighlighter.html),
+but now uses its own codebase in order to permit matching across multiple
+editors.
+There are a few configurable [options](#Options), all of which sit under the
+config key `highlight_selected_word` in the `notebook` config section.
+Options
+-------
+Options are stored in the notebook section of the nbconfig.
+The easiest way to configure these is using the
+[jupyter_nbextensions_configurator](https://github.com/Jupyter-contrib/jupyter_nbextensions_configurator)
+serverextension, but you can also configure them directly with a few lines of
+python.
+The available options are:
+* `highlight_selected_word.highlight_across_all_cells` - if `true`, (default)
+  highlight matches across all cells. If `false`, only matches within the
+  currently selected cell will be highlighted.
+* `highlight_selected_word.code_cells_only` - Only apply highlights to editors
+  for Code cells, not, for example, Markdown or Raw cells
+* `highlight_selected_word.highlight_color` - Color used to highlight matching
+  words in the focused (active) cell
+* `highlight_selected_word.highlight_color_blurred` - Color used to highlight
+  matching words in blurred (non-active) cells
+* `highlight_selected_word.outlines_only` - Highlight words using just an
+  outline, rather than the background color. In contrast to the default
+  background-color highlight, the outline-only is also applied to the
+  currently-selected word
+* `highlight_selected_word.outline_width` - Width, in pixels, of the outline
+  used to highlight words when the outline-only setting (above) is selected.
+  Defaults to 1.
+* `highlight_selected_word.delay` - Wait time (in milliseconds) before
+  highlighting the matches
+* `highlight_selected_word.words_only` - If true, only highlight matches if the
+  selected text is a word
+* `highlight_selected_word.highlight_only_whole_words` - Only highlight matches
+  which are surrounded by non-word characters. This will use the token
+  `highlight_selected_word.show_token` to identify word characters, if it's
+  set, otherwise the regular expression `[\w$]` will be used.
+* `highlight_selected_word.show_token` - Token (regex) to identify word
+  characters, used to determine what to highlight when nothing is selected.
+  If blank, nothing is highlighted when nothing is selected.
+  This regex is also used to determine word boundaries for
+  `highlight_selected_word.highlight_only_whole_words`.
+* `highlight_selected_word.min_chars` - Minimum number of characters that must
+  be selected for the highlighting behavior to occur
+* `highlight_selected_word.use_toggle_hotkey` - Bind the
+  `highlight_selected_word.toggle` action to a hotkey. Defaults to `false`.
+* `highlight_selected_word.toggle_hotkey` - Which hotkey to bind to the
+  `highlight_selected_word.toggle` action (if set to use, see item above).
+  Defaults to `alt-h`
+* `highlight_selected_word.only_cells_in_scroll` - Only apply highlights to
+  editors which are visible in the scrolled view. This may offer performance
+  benefits for larger notebooks, but may be annoying if you're doing a lot of
+  scrolling :/
+* `highlight_selected_word.scroll_min_delay` - Minimum delay in ms between
+  updating highlights on scrolling the notebook (used only if
+  `highlight_selected_word.only_cells_in_scroll` is `true`).
+  If set to zero, no update is done on scroll.
+* `highlight_selected_word.hide_selections_in_unfocussed` - Hide any text
+  selection in non-focussed cells. Otherwise, each cell can show a text
+  selection even when its editor is not focussed, which can be confused with
+  match highlights.
+For example, to set the delay to half a second, and limit highlighting to code
+cells, we can use the following python snippet:
+```python
+from notebook.services.config import ConfigManager
+cm = ConfigManager()
+cm.update('notebook', {'highlight_selected_word': {
+    'delay': 500,
+    'code_cells_only': True,
+}})
+```
+Feedback
+--------
+If you have any feedback, or have any problems, please let me know by
+[opening an issue](https://github.com/jcb91/jupyter_highlight_selected_word/issues/new)
+at the project's
+[github repository](https://github.com/jcb91/jupyter_highlight_selected_word).
+Thanks!
+Josh.

.local/share/jupyter/nbextensions/highlight_selected_word/configurator.yaml ADDED Viewed

	@@ -0,0 +1,131 @@

+Type: Jupyter Notebook Extension
+Compatibility: 4.x, 5.x
+Name: Highlight selected word
+Main: main.js
+Description: Enables the CodeMirror addon "Match Highlighter"
+Link: README.md
+Parameters:
+- name: highlight_selected_word.enable_on_load
+  input_type: checkbox
+  default: true
+  description: |
+    Enable highlighting on loading the notebook interface.
+    The highlighting can also be toggled from the view menu
+- name: highlight_selected_word.highlight_across_all_cells
+  input_type: checkbox
+  default: true
+  description: |
+    Highlight matches across all cells. If false, only matches within the
+    currently selected cell will be highlighted.
+- name: highlight_selected_word.code_cells_only
+  input_type: checkbox
+  default: false
+  description: |
+    Only apply highlights to editors for Code cells, not, for example, Markdown
+    or Raw cells
+- name: highlight_selected_word.highlight_color
+  input_type: color
+  default: '#90EE90'
+  description: Color used to highlight matching words in the focussed cell
+- name: highlight_selected_word.highlight_color_blurred
+  input_type: color
+  default: '#BBFFBB'
+  description: Color used to highlight matching words in blurred (non-active) cells
+- name: highlight_selected_word.outlines_only
+  input_type: checkbox
+  default: false
+  description: |
+    Highlight words using just an outline, rather than the background color
+- name: highlight_selected_word.outline_width
+  input_type: number
+  default: 1
+  min: 0.5
+  step: 0.5
+  description: |
+    Width, in pixels, of the outline used to highlight words when the
+    outline-only setting is selected.
+- name: highlight_selected_word.delay
+  input_type: number
+  default: 100
+  min: 0
+  step: 1
+  description: 'Wait time, in milliseconds, before highlighting the matches'
+- name: highlight_selected_word.words_only
+  input_type: checkbox
+  default: false
+  description: Only highlight matches if the selected text is a whole word
+- name: highlight_selected_word.highlight_only_whole_words
+  input_type: checkbox
+  default: true
+  description: |
+    Only highlight matches when they are surrounded by non-word characters, as
+    determined by the token below (if set), or the default regex '[\w$]'.
+- name: highlight_selected_word.show_token
+  input_type: text
+  default: '[\w$]'  # single-quote strings in yaml are like python raw strings
+  description: |
+    Token (regex) to identify word characters, used to determine what to
+    highlight when nothing is selected. If blank, nothing is highlighted when
+    nothing is selected.
+- name: highlight_selected_word.min_chars
+  input_type: number
+  default: 2
+  min: 0
+  step: 1
+  description: |
+    Minimum number of characters that must be selected for the highlighting
+    to occur (assuming no token is set for use when nothing is selected)
+- name: highlight_selected_word.trim
+  input_type: checkbox
+  default: true
+  description: |
+    Trim whitespace from selection text before checking for minimum length
+- name: highlight_selected_word.use_toggle_hotkey
+  input_type: checkbox
+  default: false
+  description: |
+    Bind the highlight_selected_word:toggle action to a hotkey
+- name: highlight_selected_word.toggle_hotkey
+  input_type: hotkey
+  default: 'alt-h'
+  description: |
+    Hotkey to bind to the highlight_selected_word:toggle action (if selected
+    for use, above)
+- name: highlight_selected_word.only_cells_in_scroll
+  input_type: checkbox
+  default: true
+  description: |
+    Only apply highlights to editors which are visible in the scrolled view.
+    This may offer performance benefits for larger notebooks
+- name: highlight_selected_word.scroll_min_delay
+  input_type: number
+  default: 100
+  min: 0
+  step: 10
+  description: |
+    Minimum delay in ms between updating highlights on scrolling the notebook
+    (used only if limiting highlights to those in scrolled view, see above).
+    If set to zero, no update is done on scroll.
+- name: highlight_selected_word.hide_selections_in_unfocussed
+  input_type: checkbox
+  default: false
+  description: |
+    Hide any text selection in non-focussed cells (can be confused with match highlights).

.local/share/jupyter/nbextensions/highlighter/demo_highlighter.ipynb ADDED Viewed

	@@ -0,0 +1,96 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## The highlighter extension:\n",
+    "\n",
+    "- First of all, the extension provides <span class=\"mark\">several toolbar buttons</span> for highlighting a selected text _within a markdown cell_. Three different \\`color schemes' are provided, which can be easily customized in the \\textit{stylesheet} `highlighter.css`. The last button enables to remove all highlightings in the current cell. \n",
+    "- This works both <span class=\"burk\">when the cell is _rendered_ and when the cell is in edit mode</span>; \n",
+    "- In both modes, it is possible to highlight formatted portions of text (In rendered mode, since the selected text loses its formatting, a heuristic is applied to find the best alignment with the actual text)\n",
+    "- When no text is selected, the whole cell is highlighted; \n",
+    "- The extension also provides two keyboard shortcuts (Alt-G and Alt-H) which fire the highlighting of the selected text. \n",
+    "- Highlights can be preserved when exporting to html or to LaTeX -- details are provided in [export_highlights](export_highlights.ipynb)\n",
+    "\n",
+    "\n",
+    "![](image.gif)\n",
+    "\n",
+    "## Installation:\n",
+    "\n",
+    "The extension can be installed with the nice UI available on the jupyter_contrib_nbextensions website, which also allows enabling/disabling the extension. \n",
+    "\n",
+    "You may also install the extension from the original repo: issue\n",
+    "```bash\n",
+    "jupyter nbextension install https://rawgit.com/jfbercher/small_nbextensions/master/highlighter.zip  --user\n",
+    "\n",
+    "```\n",
+    "at the command line.\n",
+    "\n",
+    "### Testing: \n",
+    "\n",
+    "Use a code cell with\n",
+    "```javascript\n",
+    "%%javascript\n",
+    "require(\"base/js/utils\").load_extensions(\"highlighter/highlighter\")\n",
+    "```\n",
+    "\n",
+    "### Automatic load\n",
+    "You may also automatically load the extension for any notebook via\n",
+    "```bash\n",
+    "jupyter nbextension enable highlighter/highlighter\t\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/javascript": [
+       "require(\"base/js/utils\").load_extensions(\"highlighter/highlighter\")"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Javascript object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "%%javascript\n",
+    "require(\"base/js/utils\").load_extensions(\"highlighter/highlighter\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "interactive_sols": {
+   "cbx_id": 1
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3+"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

.local/share/jupyter/nbextensions/keyboard_shortcut_editor/icon.png ADDED Viewed

.local/share/jupyter/nbextensions/keyboard_shortcut_editor/readme_undefined_key.png ADDED Viewed

.local/share/jupyter/nbextensions/load_tex_macros/main.js ADDED Viewed

	@@ -0,0 +1,39 @@

+define(function(require, exports, module) {
+    var Jupyter = require('base/js/namespace');
+    function loadLatexUserDefs() {
+        $.get('latexdefs.tex').done(function(data) {
+            data = data.replace(/^/gm, '\$\$\$').replace(/$/gm, '\$\$\$');
+            if ($('#latexdefs').length > 0) $('#latexdefs').remove();
+            $('body').append($('<div/>').attr('id', 'latexdefs').text(data));
+            console.log('latex_envs: loaded user LaTeX definitions latexdefs.tex');
+        }).fail(function() {
+            console.log('load_tex_macros: failed to load user LaTeX definitions latexdefs.tex')
+        });
+    }
+    function rerenderMaths() { // probably something like that
+            MathJax.Hub.Queue(
+              ["resetEquationNumbers",MathJax.InputJax.TeX],
+              ["PreProcess", MathJax.Hub],
+              ["Reprocess", MathJax.Hub]
+            );
+    }
+    function load_ipython_extension() {
+         "use strict";
+        if (Jupyter.notebook._fully_loaded) {
+            loadLatexUserDefs();
+            rerenderMaths();
+        } else {
+            $([Jupyter.events]).on("notebook_loaded.Notebook", function() {
+                      loadLatexUserDefs();
+                      rerenderMaths();
+            })
+        }
+    }
+    return {
+            load_ipython_extension: load_ipython_extension,
+    };
+})

.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin ADDED Viewed

Binary file (13.3 kB). View file

.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx ADDED Viewed

	@@ -0,0 +1,651 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3de4e
+.extern .shared .align 1 .b8 global_smem[];
+.visible .entry triton__0d1d2d3de4e(
+	.param .u64 triton__0d1d2d3de4e_param_0,
+	.param .u64 triton__0d1d2d3de4e_param_1,
+	.param .u64 triton__0d1d2d3de4e_param_2,
+	.param .u32 triton__0d1d2d3de4e_param_3,
+	.param .u32 triton__0d1d2d3de4e_param_4
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<15>;
+	.reg .b32 	%r<91>;
+	.reg .f32 	%f<62>;
+	.reg .b64 	%rd<16>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd5, [triton__0d1d2d3de4e_param_0];
+	ld.param.u64 	%rd6, [triton__0d1d2d3de4e_param_1];
+$L__tmp0:
+	.loc	1 22 44
+	mov.u32 	%r24, %tid.x;
+	and.b32  	%r25, %r24, 31;
+	ld.param.u64 	%rd7, [triton__0d1d2d3de4e_param_2];
+	and.b32  	%r26, %r24, 3;
+	.loc	1 24 33
+	bfe.u32 	%r27, %r24, 5, 2;
+	and.b32  	%r28, %r24, 127;
+	.loc	1 21 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 21 33
+	shl.b32 	%r29, %r1, 2;
+	.loc	1 22 23
+	or.b32  	%r30, %r29, %r26;
+	.loc	1 29 25
+	setp.lt.u32 	%p1, %r28, 120;
+	.loc	1 31 47
+	shl.b32 	%r31, %r28, 17;
+	.loc	1 31 40
+	add.s32 	%r32, %r29, %r31;
+	.loc	1 31 34
+	mul.wide.s32 	%rd8, %r32, 4;
+	add.s64 	%rd1, %rd5, %rd8;
+	mov.b32 	%r6, 0;
+	.loc	1 31 53
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r2;
+	mov.b32 	%f2, %r3;
+	mov.b32 	%f3, %r4;
+	mov.b32 	%f4, %r5;
+	.loc	1 33 23
+	add.f32 	%f5, %f1, 0f00000000;
+	add.f32 	%f6, %f2, 0f00000000;
+	add.f32 	%f7, %f3, 0f00000000;
+	add.f32 	%f8, %f4, 0f00000000;
+	.loc	1 34 38
+	selp.f32 	%f9, %f5, 0f00000000, %p1;
+	selp.f32 	%f10, %f6, 0f00000000, %p1;
+	selp.f32 	%f11, %f7, 0f00000000, %p1;
+	selp.f32 	%f12, %f8, 0f00000000, %p1;
+$L__tmp1:
+	.loc	2 243 36
+	mov.b32 	%r33, %f9;
+	shfl.sync.bfly.b32	%r34, %r33, 16, 31, -1;
+	mov.b32 	%f13, %r34;
+$L__tmp2:
+	.loc	2 233 15
+	add.f32 	%f14, %f9, %f13;
+$L__tmp3:
+	.loc	2 243 36
+	mov.b32 	%r35, %f14;
+	shfl.sync.bfly.b32	%r36, %r35, 8, 31, -1;
+	mov.b32 	%f15, %r36;
+$L__tmp4:
+	.loc	2 233 15
+	add.f32 	%f16, %f14, %f15;
+$L__tmp5:
+	.loc	2 243 36
+	mov.b32 	%r37, %f16;
+	shfl.sync.bfly.b32	%r38, %r37, 4, 31, -1;
+	mov.b32 	%f17, %r38;
+$L__tmp6:
+	.loc	2 233 15
+	add.f32 	%f18, %f16, %f17;
+$L__tmp7:
+	.loc	2 243 36
+	mov.b32 	%r39, %f18;
+	shfl.sync.bfly.b32	%r40, %r39, 2, 31, -1;
+	mov.b32 	%f19, %r40;
+$L__tmp8:
+	.loc	2 233 15
+	add.f32 	%f20, %f18, %f19;
+$L__tmp9:
+	.loc	2 243 36
+	mov.b32 	%r41, %f20;
+	shfl.sync.bfly.b32	%r42, %r41, 1, 31, -1;
+	mov.b32 	%f21, %r42;
+$L__tmp10:
+	.loc	2 233 15
+	add.f32 	%f22, %f20, %f21;
+$L__tmp11:
+	.loc	2 243 36
+	mov.b32 	%r43, %f10;
+	shfl.sync.bfly.b32	%r44, %r43, 16, 31, -1;
+	mov.b32 	%f23, %r44;
+$L__tmp12:
+	.loc	2 233 15
+	add.f32 	%f24, %f10, %f23;
+$L__tmp13:
+	.loc	2 243 36
+	mov.b32 	%r45, %f24;
+	shfl.sync.bfly.b32	%r46, %r45, 8, 31, -1;
+	mov.b32 	%f25, %r46;
+$L__tmp14:
+	.loc	2 233 15
+	add.f32 	%f26, %f24, %f25;
+$L__tmp15:
+	.loc	2 243 36
+	mov.b32 	%r47, %f26;
+	shfl.sync.bfly.b32	%r48, %r47, 4, 31, -1;
+	mov.b32 	%f27, %r48;
+$L__tmp16:
+	.loc	2 233 15
+	add.f32 	%f28, %f26, %f27;
+$L__tmp17:
+	.loc	2 243 36
+	mov.b32 	%r49, %f28;
+	shfl.sync.bfly.b32	%r50, %r49, 2, 31, -1;
+	mov.b32 	%f29, %r50;
+$L__tmp18:
+	.loc	2 233 15
+	add.f32 	%f30, %f28, %f29;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r51, %f30;
+	shfl.sync.bfly.b32	%r52, %r51, 1, 31, -1;
+	mov.b32 	%f31, %r52;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f32, %f30, %f31;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r53, %f11;
+	shfl.sync.bfly.b32	%r54, %r53, 16, 31, -1;
+	mov.b32 	%f33, %r54;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f34, %f11, %f33;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r55, %f34;
+	shfl.sync.bfly.b32	%r56, %r55, 8, 31, -1;
+	mov.b32 	%f35, %r56;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f36, %f34, %f35;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r57, %f36;
+	shfl.sync.bfly.b32	%r58, %r57, 4, 31, -1;
+	mov.b32 	%f37, %r58;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f38, %f36, %f37;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r59, %f38;
+	shfl.sync.bfly.b32	%r60, %r59, 2, 31, -1;
+	mov.b32 	%f39, %r60;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f40, %f38, %f39;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r61, %f40;
+	shfl.sync.bfly.b32	%r62, %r61, 1, 31, -1;
+	mov.b32 	%f41, %r62;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f42, %f40, %f41;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r63, %f12;
+	shfl.sync.bfly.b32	%r64, %r63, 16, 31, -1;
+	mov.b32 	%f43, %r64;
+$L__tmp32:
+	.loc	2 233 15
+	add.f32 	%f44, %f12, %f43;
+$L__tmp33:
+	.loc	2 243 36
+	mov.b32 	%r65, %f44;
+	shfl.sync.bfly.b32	%r66, %r65, 8, 31, -1;
+	mov.b32 	%f45, %r66;
+$L__tmp34:
+	.loc	2 233 15
+	add.f32 	%f46, %f44, %f45;
+$L__tmp35:
+	.loc	2 243 36
+	mov.b32 	%r67, %f46;
+	shfl.sync.bfly.b32	%r68, %r67, 4, 31, -1;
+	mov.b32 	%f47, %r68;
+$L__tmp36:
+	.loc	2 233 15
+	add.f32 	%f48, %f46, %f47;
+$L__tmp37:
+	.loc	2 243 36
+	mov.b32 	%r69, %f48;
+	shfl.sync.bfly.b32	%r70, %r69, 2, 31, -1;
+	mov.b32 	%f49, %r70;
+$L__tmp38:
+	.loc	2 233 15
+	add.f32 	%f50, %f48, %f49;
+$L__tmp39:
+	.loc	2 243 36
+	mov.b32 	%r71, %f50;
+	shfl.sync.bfly.b32	%r72, %r71, 1, 31, -1;
+	mov.b32 	%f51, %r72;
+$L__tmp40:
+	.loc	2 233 15
+	add.f32 	%f52, %f50, %f51;
+$L__tmp41:
+	.loc	2 243 36
+	setp.eq.s32 	%p6, %r25, 0;
+	shl.b32 	%r73, %r27, 2;
+	mov.u32 	%r74, global_smem;
+	add.s32 	%r10, %r74, %r73;
+	mov.b32 	%r11, %f22;
+	@%p6 st.shared.b32 [ %r10 + 0 ], %r11;
+	add.s32 	%r12, %r10, 16;
+	mov.b32 	%r13, %f32;
+	@%p6 st.shared.b32 [ %r12 + 0 ], %r13;
+	add.s32 	%r14, %r10, 32;
+	mov.b32 	%r15, %f42;
+	@%p6 st.shared.b32 [ %r14 + 0 ], %r15;
+	add.s32 	%r16, %r10, 48;
+	mov.b32 	%r17, %f52;
+	@%p6 st.shared.b32 [ %r16 + 0 ], %r17;
+	bar.sync 	0;
+	setp.lt.s32 	%p10, %r24, 16;
+	shl.b32 	%r75, %r24, 2;
+	add.s32 	%r19, %r74, %r75;
+	@%p10 ld.shared.b32 %r18, [ %r19 + 0 ];
+	mov.b32 	%f53, %r18;
+	shfl.sync.bfly.b32	%r76, %r18, 2, 31, -1;
+	mov.b32 	%f54, %r76;
+$L__tmp42:
+	.loc	2 233 15
+	add.f32 	%f55, %f53, %f54;
+$L__tmp43:
+	.loc	2 243 36
+	mov.b32 	%r77, %f55;
+	shfl.sync.bfly.b32	%r78, %r77, 1, 31, -1;
+	mov.b32 	%f56, %r78;
+$L__tmp44:
+	.loc	2 233 15
+	add.f32 	%f57, %f55, %f56;
+$L__tmp45:
+	.loc	2 243 36
+	setp.eq.s32 	%p14, %r26, 0;
+	and.pred  	%p11, %p10, %p14;
+	mov.b32 	%r21, %f57;
+	@%p11 st.shared.b32 [ %r19 + 0 ], %r21;
+	bar.sync 	0;
+	ld.shared.f32 	%f58, [global_smem];
+	ld.shared.f32 	%f59, [global_smem+16];
+	ld.shared.f32 	%f60, [global_smem+32];
+	ld.shared.f32 	%f61, [global_smem+48];
+$L__tmp46:
+	.loc	1 35 28
+	bar.sync 	0;
+	st.shared.f32 	[global_smem], %f58;
+	st.shared.f32 	[global_smem+4], %f59;
+	st.shared.f32 	[global_smem+8], %f60;
+	st.shared.f32 	[global_smem+12], %f61;
+	bar.sync 	0;
+	shl.b32 	%r79, %r26, 2;
+	add.s32 	%r80, %r74, %r79;
+	.loc	1 36 20
+	shr.s32 	%r82, %r30, 31;
+	shr.u32 	%r83, %r82, 24;
+	add.s32 	%r84, %r30, %r83;
+	shr.s32 	%r85, %r84, 8;
+	and.b32  	%r86, %r84, -256;
+	sub.s32 	%r87, %r30, %r86;
+	.loc	1 38 30
+	mul.wide.s32 	%rd9, %r85, 8;
+	add.s64 	%rd3, %rd6, %rd9;
+	.loc	1 45 55
+	ld.shared.u32 	%r23, [%r80];
+	mov.pred 	%p12, -1;
+	.loc	1 38 35
+	mov.u64 %rd2, 0x0;
+	@%p12 ld.global.L1::evict_last.b64 { %rd2 }, [ %rd3 + 0 ];
+	.loc	1 41 32
+	shr.u64 	%rd10, %rd2, 54;
+	and.b64  	%rd11, %rd10, 512;
+	add.s64 	%rd12, %rd11, %rd2;
+	.loc	1 45 30
+	shl.b64 	%rd13, %rd12, 10;
+	add.s64 	%rd14, %rd7, %rd13;
+	mul.wide.s32 	%rd15, %r87, 4;
+	add.s64 	%rd4, %rd14, %rd15;
+	.loc	1 45 55
+	bfe.u32 	%r88, %r24, 2, 3;
+	shl.b32 	%r89, %r27, 3;
+	or.b32  	%r90, %r89, %r88;
+	setp.eq.s32 	%p13, %r90, 0;
+	mov.u32 %r22, 0x0;
+	@%p13 atom.global.gpu.acq_rel.add.f32 %r22, [ %rd4 + 0 ], %r23;
+	.loc	1 45 4
+	ret;
+$L__tmp47:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 264
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 54
+.b8 105
+.b8 107
+.b8 53
+.b8 118
+.b8 120
+.b8 55
+.b8 112
+.b8 50
+.b8 50
+.b8 102
+.b8 112
+.b8 107
+.b8 52
+.b8 100
+.b8 99
+.b8 118
+.b8 104
+.b8 53
+.b8 53
+.b8 122
+.b8 105
+.b8 109
+.b8 119
+.b8 52
+.b8 116
+.b8 53
+.b8 110
+.b8 114
+.b8 53
+.b8 122
+.b8 110
+.b8 50
+.b8 98
+.b8 55
+.b8 105
+.b8 110
+.b8 117
+.b8 106
+.b8 120
+.b8 106
+.b8 97
+.b8 117
+.b8 120
+.b8 115
+.b8 104
+.b8 108
+.b8 106
+.b8 117
+.b8 109
+.b8 109
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 54
+.b8 105
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp46
+.b8 2
+.b8 35
+.b8 25
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp45
+.b8 2
+.b8 35
+.b8 25
+.b8 4
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp45
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 268
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 268
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,60 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<4x1xi64, #blocked>
+    %cst_0 = arith.constant dense<0> : tensor<4x1xi64, #blocked>
+    %cst_1 = arith.constant dense<512> : tensor<4x1xi64, #blocked>
+    %cst_2 = arith.constant dense<256> : tensor<4x1xi32, #blocked>
+    %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
+    %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1>
+    %cst_6 = arith.constant dense<true> : tensor<4x1xi1, #blocked>
+    %c4_i32 = arith.constant 4 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c4_i32 : i32
+    %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4x1xi32, #blocked1>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xi32, #blocked>
+    %6 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked1>
+    %7 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked>
+    %8 = arith.addi %6, %4 : tensor<4x1xi32, #blocked1>
+    %9 = arith.addi %7, %5 : tensor<4x1xi32, #blocked>
+    %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
+    %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
+    %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
+    %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
+    %14 = tt.broadcast %8 : (tensor<4x1xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
+    %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
+    %16 = arith.addi %14, %15 : tensor<4x128xi32, #blocked1>
+    %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>, #blocked1>
+    %18 = tt.addptr %17, %16 : tensor<4x128x!tt.ptr<f32, 1>, #blocked1>, tensor<4x128xi32, #blocked1>
+    %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<4x128xi1, #blocked1>
+    %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32, #blocked1>
+    %21 = arith.addf %20, %cst_5 : tensor<4x128xf32, #blocked1>
+    %22 = arith.select %19, %21, %cst_5 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1>
+    %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %40 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %24 = triton_gpu.convert_layout %23 : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xf32, #blocked>
+    %26 = arith.divsi %9, %cst_2 : tensor<4x1xi32, #blocked>
+    %27 = arith.remsi %9, %cst_2 : tensor<4x1xi32, #blocked>
+    %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>, #blocked>
+    %29 = tt.addptr %28, %26 : tensor<4x1x!tt.ptr<i64, 1>, #blocked>, tensor<4x1xi32, #blocked>
+    %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64, #blocked>
+    %31 = arith.addi %30, %cst_1 : tensor<4x1xi64, #blocked>
+    %32 = arith.cmpi slt, %30, %cst_0 : tensor<4x1xi64, #blocked>
+    %33 = arith.select %32, %31, %30 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked>
+    %34 = arith.muli %33, %cst : tensor<4x1xi64, #blocked>
+    %35 = arith.extsi %27 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked>
+    %36 = arith.addi %35, %34 : tensor<4x1xi64, #blocked>
+    %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>, #blocked>
+    %38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xi64, #blocked>
+    %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xf32, #blocked>, tensor<4x1xi1, #blocked>) -> tensor<4x1xf32, #blocked>
+    tt.return
+  }
+}

.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir ADDED Viewed

	@@ -0,0 +1,53 @@

+module {
+  tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<4x1xi64>
+    %cst_0 = arith.constant dense<0> : tensor<4x1xi64>
+    %cst_1 = arith.constant dense<512> : tensor<4x1xi64>
+    %cst_2 = arith.constant dense<true> : tensor<4x1xi1>
+    %cst_3 = arith.constant dense<256> : tensor<4x1xi32>
+    %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
+    %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xf32>
+    %c4_i32 = arith.constant 4 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c4_i32 : i32
+    %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32>) -> tensor<4x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<4x1xi32>
+    %5 = arith.addi %4, %3 : tensor<4x1xi32>
+    %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
+    %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
+    %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
+    %10 = tt.broadcast %5 : (tensor<4x1xi32>) -> tensor<4x128xi32>
+    %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<4x128xi32>
+    %12 = arith.addi %10, %11 : tensor<4x128xi32>
+    %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>>
+    %14 = tt.addptr %13, %12 : tensor<4x128x!tt.ptr<f32, 1>>, tensor<4x128xi32>
+    %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<4x128xi1>
+    %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32>
+    %17 = arith.addf %16, %cst_6 : tensor<4x128xf32>
+    %18 = arith.select %15, %17, %cst_6 : tensor<4x128xi1>, tensor<4x128xf32>
+    %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %35 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %35 : f32
+    }) : (tensor<4x128xf32>) -> tensor<4xf32>
+    %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<4xf32>) -> tensor<4x1xf32>
+    %21 = arith.divsi %5, %cst_3 : tensor<4x1xi32>
+    %22 = arith.remsi %5, %cst_3 : tensor<4x1xi32>
+    %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>>
+    %24 = tt.addptr %23, %21 : tensor<4x1x!tt.ptr<i64, 1>>, tensor<4x1xi32>
+    %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64>
+    %26 = arith.addi %25, %cst_1 : tensor<4x1xi64>
+    %27 = arith.cmpi slt, %25, %cst_0 : tensor<4x1xi64>
+    %28 = arith.select %27, %26, %25 : tensor<4x1xi1>, tensor<4x1xi64>
+    %29 = arith.muli %28, %cst : tensor<4x1xi64>
+    %30 = arith.extsi %22 : tensor<4x1xi32> to tensor<4x1xi64>
+    %31 = arith.addi %30, %29 : tensor<4x1xi64>
+    %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>>
+    %33 = tt.addptr %32, %31 : tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xi64>
+    %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xf32>, tensor<4x1xi1>) -> tensor<4x1xf32>
+    tt.return
+  }
+}

.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.cubin ADDED Viewed

Binary file (7.07 kB). View file

.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir ADDED Viewed

	@@ -0,0 +1,162 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = and i32 %4, 127, !dbg !8
+  %6 = shl nuw nsw i32 %5, 3, !dbg !8
+  %7 = shl nuw nsw i32 %5, 2, !dbg !8
+  %8 = or i32 %7, 512, !dbg !8
+  %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
+  %10 = shl i32 %9, 10, !dbg !10
+  %11 = or i32 %10, %6, !dbg !11
+  %12 = or i32 %10, %7, !dbg !11
+  %13 = or i32 %10, %8, !dbg !11
+  %14 = sext i32 %11 to i64, !dbg !12
+  %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
+  %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
+  %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
+  %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
+  %21 = trunc i32 %17 to i16, !dbg !13
+  %extelt.offset = lshr i32 %17, 16, !dbg !13
+  %22 = trunc i32 %extelt.offset to i16, !dbg !13
+  %23 = trunc i32 %18 to i16, !dbg !13
+  %extelt.offset1 = lshr i32 %18, 16, !dbg !13
+  %24 = trunc i32 %extelt.offset1 to i16, !dbg !13
+  %25 = trunc i32 %19 to i16, !dbg !13
+  %extelt.offset2 = lshr i32 %19, 16, !dbg !13
+  %26 = trunc i32 %extelt.offset2 to i16, !dbg !13
+  %27 = trunc i32 %20 to i16, !dbg !13
+  %extelt.offset3 = lshr i32 %20, 16, !dbg !13
+  %28 = trunc i32 %extelt.offset3 to i16, !dbg !13
+  %29 = zext nneg i32 %6 to i64, !dbg !14
+  %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
+  %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
+  store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
+  %32 = or i32 %6, 1, !dbg !14
+  %33 = zext nneg i32 %32 to i64, !dbg !14
+  %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
+  %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
+  store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
+  %36 = or i32 %6, 2, !dbg !14
+  %37 = zext nneg i32 %36 to i64, !dbg !14
+  %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
+  %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
+  store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
+  %40 = or i32 %6, 3, !dbg !14
+  %41 = zext nneg i32 %40 to i64, !dbg !14
+  %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
+  %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
+  store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
+  %44 = or i32 %6, 4, !dbg !14
+  %45 = zext nneg i32 %44 to i64, !dbg !14
+  %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
+  %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
+  store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
+  %48 = or i32 %6, 5, !dbg !14
+  %49 = zext nneg i32 %48 to i64, !dbg !14
+  %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
+  %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
+  store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
+  %52 = or i32 %6, 6, !dbg !14
+  %53 = zext nneg i32 %52 to i64, !dbg !14
+  %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
+  %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
+  store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
+  %56 = or i32 %6, 7, !dbg !14
+  %57 = zext nneg i32 %56 to i64, !dbg !14
+  %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
+  %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
+  store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
+  tail call void @llvm.nvvm.barrier0(), !dbg !14
+  %60 = zext nneg i32 %7 to i64, !dbg !14
+  %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
+  %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
+  %63 = or i32 %7, 1, !dbg !14
+  %64 = zext nneg i32 %63 to i64, !dbg !14
+  %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
+  %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
+  %67 = or i32 %7, 2, !dbg !14
+  %68 = zext nneg i32 %67 to i64, !dbg !14
+  %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
+  %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
+  %71 = or i32 %7, 3, !dbg !14
+  %72 = zext nneg i32 %71 to i64, !dbg !14
+  %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
+  %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
+  %75 = zext nneg i32 %8 to i64, !dbg !14
+  %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
+  %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
+  %78 = or i32 %7, 513, !dbg !14
+  %79 = zext nneg i32 %78 to i64, !dbg !14
+  %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
+  %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
+  %82 = or i32 %7, 514, !dbg !14
+  %83 = zext nneg i32 %82 to i64, !dbg !14
+  %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
+  %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
+  %86 = or i32 %7, 515, !dbg !14
+  %87 = zext nneg i32 %86 to i64, !dbg !14
+  %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
+  %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
+  %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
+  %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
+  %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
+  %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
+  %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
+  %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
+  %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
+  %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
+  %98 = sext i32 %12 to i64, !dbg !15
+  %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
+  %100 = sext i32 %13 to i64, !dbg !15
+  %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
+  %102 = bitcast float %90 to i32, !dbg !16
+  %103 = bitcast float %91 to i32, !dbg !16
+  %104 = bitcast float %92 to i32, !dbg !16
+  %105 = bitcast float %93 to i32, !dbg !16
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
+  %106 = bitcast float %94 to i32, !dbg !16
+  %107 = bitcast float %95 to i32, !dbg !16
+  %108 = bitcast float %96 to i32, !dbg !16
+  %109 = bitcast float %97 to i32, !dbg !16
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
+  ret void, !dbg !17
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 24, column: 44, scope: !5)
+!15 = !DILocation(line: 26, column: 25, scope: !5)
+!16 = !DILocation(line: 26, column: 36, scope: !5)
+!17 = !DILocation(line: 26, column: 4, scope: !5)

.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx ADDED Viewed

	@@ -0,0 +1,338 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2de
+.extern .shared .align 1 .b8 global_smem[];
+.visible .entry triton__0d1d2de( // Kernel: widens 16-bit bf16 values loaded via param_0 into 32-bit floats stored via param_1 (see the cvt.f32.bf16 group below)
+	.param .u64 triton__0d1d2de_param_0, // 64-bit address the 16-bit inputs are loaded from
+	.param .u64 triton__0d1d2de_param_1, // 64-bit address the 32-bit results are stored to
+	.param .u32 triton__0d1d2de_param_2 // never loaded anywhere in this body
+)
+.maxntid 128, 1, 1 // at most 128 threads per CTA along x
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<37>;
+	.reg .b64 	%rd<13>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd5, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r22, %tid.x;
+	and.b32  	%r23, %r22, 127; // thread index masked to 0..127
+	shl.b32 	%r24, %r23, 3; // tid * 8
+	shl.b32 	%r25, %r23, 2; // tid * 4
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r26, %r1, 10; // CTA base element index = ctaid.x * 1024
+	.loc	1 21 23
+	or.b32  	%r27, %r26, %r24; // first of the 8 input elements this thread loads
+	or.b32  	%r28, %r26, %r25; // first of the 4 output elements in this thread's first store
+	.loc	1 24 30
+	mul.wide.s32 	%rd6, %r27, 2; // byte offset: 2 bytes per input element
+	add.s64 	%rd1, %rd4, %rd6;
+	mov.pred 	%p1, -1; // constant-true predicate: the load and stores guarded by %p1 always execute
+	.loc	1 24 35
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; // one 16-byte load = 8 packed 16-bit values
+	shr.u32 	%r29, %r2, 16; // upper 16-bit half of each loaded word
+	shr.u32 	%r30, %r3, 16;
+	shr.u32 	%r31, %r4, 16;
+	shr.u32 	%r32, %r5, 16;
+	.loc	1 24 44
+	shl.b32 	%r33, %r23, 4; // tid * 16 bytes: this thread's staging slot in shared memory
+	mov.u32 	%r34, global_smem;
+	add.s32 	%r35, %r34, %r33;
+	st.shared.u16 	[%r35], %r2; // stage the 8 halves in memory order (low half, then high half of each word)
+	st.shared.u16 	[%r35+2], %r29;
+	st.shared.u16 	[%r35+4], %r3;
+	st.shared.u16 	[%r35+6], %r30;
+	st.shared.u16 	[%r35+8], %r4;
+	st.shared.u16 	[%r35+10], %r31;
+	st.shared.u16 	[%r35+12], %r5;
+	st.shared.u16 	[%r35+14], %r32;
+	bar.sync 	0; // all staging stores must land before any thread reads another thread's slot
+	add.s32 	%r36, %r34, %r24; // re-read at tid * 8 bytes: a different distribution than was written
+	ld.shared.u16 	%rs1, [%r36]; // 4 consecutive 16-bit values at element tid*4
+	ld.shared.u16 	%rs2, [%r36+2];
+	ld.shared.u16 	%rs3, [%r36+4];
+	ld.shared.u16 	%rs4, [%r36+6];
+	ld.shared.u16 	%rs5, [%r36+1024]; // 4 more, 1024 bytes (512 elements) further on
+	ld.shared.u16 	%rs6, [%r36+1026];
+	ld.shared.u16 	%rs7, [%r36+1028];
+	ld.shared.u16 	%rs8, [%r36+1030];
+	cvt.f32.bf16 %r14, %rs1; // bf16 -> f32 conversion of each staged value
+	cvt.f32.bf16 %r15, %rs2;
+	cvt.f32.bf16 %r16, %rs3;
+	cvt.f32.bf16 %r17, %rs4;
+	cvt.f32.bf16 %r18, %rs5;
+	cvt.f32.bf16 %r19, %rs6;
+	cvt.f32.bf16 %r20, %rs7;
+	cvt.f32.bf16 %r21, %rs8;
+	.loc	1 26 25
+	mul.wide.s32 	%rd7, %r28, 4; // byte offset: 4 bytes per output element
+	add.s64 	%rd2, %rd5, %rd7; // address of the first output group
+	cvt.s64.s32 	%rd8, %r26;
+	cvt.u64.u32 	%rd9, %r25;
+	or.b64  	%rd10, %rd8, %rd9;
+	shl.b64 	%rd11, %rd10, 2;
+	add.s64 	%rd12, %rd5, %rd11;
+	add.s64 	%rd3, %rd12, 2048; // second output group: 2048 bytes (512 floats) past the first
+	.loc	1 26 36
+	@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 }; // 16-byte store of 4 converted values
+	@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/ot/cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 111
+.b8 116
+.b8 98
+.b8 104
+.b8 101
+.b8 116
+.b8 51
+.b8 55
+.b8 118
+.b8 54
+.b8 109
+.b8 104
+.b8 53
+.b8 115
+.b8 97
+.b8 109
+.b8 113
+.b8 108
+.b8 55
+.b8 117
+.b8 120
+.b8 114
+.b8 101
+.b8 51
+.b8 104
+.b8 112
+.b8 114
+.b8 112
+.b8 110
+.b8 98
+.b8 104
+.b8 117
+.b8 118
+.b8 105
+.b8 109
+.b8 51
+.b8 102
+.b8 109
+.b8 114
+.b8 106
+.b8 112
+.b8 113
+.b8 53
+.b8 102
+.b8 103
+.b8 103
+.b8 54
+.b8 108
+.b8 119
+.b8 98
+.b8 105
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 111
+.b8 116
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx ADDED Viewed

	@@ -0,0 +1,572 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3de4de
+.extern .shared .align 1 .b8 global_smem[];
+.visible .entry triton__0d1d2d3de4de( // Kernel: accumulates products of values loaded via param_0 and param_1, reduces them across lanes and warps, and stores 64 sums per CTA via param_2
+	.param .u64 triton__0d1d2d3de4de_param_0, // 64-bit address of the first multiplicand (loaded as 32-bit words, used as f32)
+	.param .u64 triton__0d1d2d3de4de_param_1, // 64-bit address of the second multiplicand (loaded as 32-bit words, used as f32)
+	.param .u64 triton__0d1d2d3de4de_param_2, // 64-bit address the reduced sums are stored to
+	.param .u32 triton__0d1d2d3de4de_param_3, // never loaded anywhere in this body
+	.param .u32 triton__0d1d2d3de4de_param_4 // never loaded anywhere in this body
+)
+.maxntid 128, 1, 1 // at most 128 threads per CTA along x
+{
+	.reg .pred 	%p<22>;
+	.reg .b32 	%r<98>;
+	.reg .f32 	%f<47>;
+	.reg .b64 	%rd<9>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd3, [triton__0d1d2d3de4de_param_2];
+	ld.param.u64 	%rd2, [triton__0d1d2d3de4de_param_1];
+	ld.param.u64 	%rd1, [triton__0d1d2d3de4de_param_0];
+$L__tmp0:
+	.loc	1 22 44
+	mov.u32 	%r1, %tid.x;
+	and.b32  	%r2, %r1, 31; // lane index within the warp
+	shl.b32 	%r13, %r1, 2; // tid * 4
+	and.b32  	%r3, %r13, 60; // (tid & 15) * 4: first of 4 consecutive output rows owned by this thread
+	.loc	1 24 33
+	bfe.u32 	%r4, %r1, 5, 2; // bits 5..6 of tid: warp index 0..3
+	.loc	1 21 28
+	mov.u32 %r11, %ctaid.x;
+	.loc	1 21 33
+	shl.b32 	%r5, %r11, 6; // CTA base output index = ctaid.x * 64
+	.loc	1 22 23
+	or.b32  	%r14, %r5, %r3; // global index x of this thread's first row
+	.loc	1 26 20
+	shr.s32 	%r16, %r14, 31; // next four instructions: signed x / 256, rounding toward zero
+	shr.u32 	%r17, %r16, 24;
+	add.s32 	%r18, %r14, %r17;
+	shr.s32 	%r19, %r18, 8; // q = x / 256
+	.loc	1 29 36
+	mad.lo.s32 	%r20, %r19, 32512, %r14; // x + q*(32768-256) = (x - q*256) + q*32768
+	shl.b32 	%r21, %r4, 9; // warp * 512
+	add.s32 	%r22, %r20, %r21;
+	shl.b32 	%r23, %r1, 4;
+	and.b32  	%r24, %r23, 256; // bit 4 of tid (upper half-warp) contributes 256
+	add.s32 	%r96, %r22, %r24; // element index for the first loop iteration
+	mov.f32 	%f43, 0f00000000; // four running sums, one per owned row, start at 0.0
+	mov.b32 	%r97, -8; // loop counter; incremented by 8 before the exit test
+	mov.pred 	%p1, -1; // constant-true predicate used to guard the loads below
+	mov.f32 	%f44, %f43;
+	mov.f32 	%f45, %f43;
+	mov.f32 	%f46, %f43;
+$L__BB0_1:
+	.loc	1 33 34
+	mul.wide.s32 	%rd6, %r96, 4; // byte offset: 4 bytes per element, shared by both inputs
+	add.s64 	%rd4, %rd1, %rd6;
+	mov.b32 	%r29, 0;
+	.loc	1 33 63
+	mov.u32 %r25, 0x0;
+	mov.u32 %r26, 0x0;
+	mov.u32 %r27, 0x0;
+	mov.u32 %r28, 0x0;
+	@%p1 ld.global.L1::evict_first.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd4 + 0 ]; // 16-byte load from param_0 with evict-first L1 policy
+	@!%p1 mov.u32 %r25, %r29; // masked-off fallback to 0; never taken because %p1 is constant true
+	@!%p1 mov.u32 %r26, %r29;
+	@!%p1 mov.u32 %r27, %r29;
+	@!%p1 mov.u32 %r28, %r29;
+	.loc	1 34 34
+	add.s64 	%rd5, %rd2, %rd6;
+	.loc	1 34 63
+	mov.u32 %r33, 0x0;
+	mov.u32 %r34, 0x0;
+	mov.u32 %r35, 0x0;
+	mov.u32 %r36, 0x0;
+	@%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ]; // matching 16-byte load from param_1
+	@!%p1 mov.u32 %r33, %r29; // same never-taken fallback as above
+	@!%p1 mov.u32 %r34, %r29;
+	@!%p1 mov.u32 %r35, %r29;
+	@!%p1 mov.u32 %r36, %r29;
+	.loc	1 33 63
+	mov.b32 	%f13, %r25; // reinterpret the loaded bits as f32
+	mov.b32 	%f14, %r26;
+	mov.b32 	%f15, %r27;
+	mov.b32 	%f16, %r28;
+	.loc	1 34 63
+	mov.b32 	%f17, %r33;
+	mov.b32 	%f18, %r34;
+	mov.b32 	%f19, %r35;
+	mov.b32 	%f20, %r36;
+	.loc	1 38 38
+	fma.rn.f32 	%f46, %f16, %f20, %f46; // sum += a * b, one fused multiply-add per owned row
+	fma.rn.f32 	%f45, %f15, %f19, %f45;
+	fma.rn.f32 	%f44, %f14, %f18, %f44;
+	fma.rn.f32 	%f43, %f13, %f17, %f43;
+	.loc	1 29 36
+	add.s32 	%r97, %r97, 8;
+	add.s32 	%r96, %r96, 2048; // advance 2048 elements per iteration
+	setp.lt.u32 	%p11, %r97, 120; // counter runs 0, 8, ..., 120 after the increment: 16 iterations in total
+	@%p11 bra 	$L__BB0_1;
+	.loc	1 22 44
+	and.b32  	%r58, %r1, 63; // output row written by this thread at the very end
+	.loc	1 22 23
+	or.b32  	%r59, %r5, %r58;
+$L__tmp1:
+	.loc	2 243 36
+	mov.b32 	%r60, %f43;
+	shfl.sync.bfly.b32	%r61, %r60, 16, 31, -1; // exchange with lane XOR 16 (full-warp member mask)
+	mov.b32 	%f21, %r61;
+$L__tmp2:
+	.loc	2 233 15
+	add.f32 	%f22, %f43, %f21; // combine the two half-warp partial sums for row r3+0
+$L__tmp3:
+	.loc	2 243 36
+	mov.b32 	%r62, %f44;
+	shfl.sync.bfly.b32	%r63, %r62, 16, 31, -1;
+	mov.b32 	%f23, %r63;
+$L__tmp4:
+	.loc	2 233 15
+	add.f32 	%f24, %f44, %f23; // row r3+1
+$L__tmp5:
+	.loc	2 243 36
+	mov.b32 	%r64, %f45;
+	shfl.sync.bfly.b32	%r65, %r64, 16, 31, -1;
+	mov.b32 	%f25, %r65;
+$L__tmp6:
+	.loc	2 233 15
+	add.f32 	%f26, %f45, %f25; // row r3+2
+$L__tmp7:
+	.loc	2 243 36
+	mov.b32 	%r66, %f46;
+	shfl.sync.bfly.b32	%r67, %r66, 16, 31, -1;
+	mov.b32 	%f27, %r67;
+$L__tmp8:
+	.loc	2 233 15
+	add.f32 	%f28, %f46, %f27; // row r3+3
+$L__tmp9:
+	.loc	2 243 36
+	setp.lt.u32 	%p12, %r2, 16; // only lanes 0..15 publish their per-warp partial to shared memory
+	shl.b32 	%r68, %r3, 2;
+	or.b32  	%r69, %r68, %r4;
+	shl.b32 	%r70, %r69, 2; // byte offset of float slot (r3*4 + warp)
+	mov.u32 	%r71, global_smem;
+	add.s32 	%r41, %r71, %r70;
+	mov.b32 	%r42, %f22;
+	@%p12 st.shared.b32 [ %r41 + 0 ], %r42;
+	shl.b32 	%r72, %r4, 2; // warp * 4 bytes
+	shl.b32 	%r73, %r3, 4; // r3 * 16 bytes: start of row r3's group of 4 slots
+	or.b32  	%r74, %r73, 16; // row r3+1
+	or.b32  	%r75, %r74, %r72;
+	add.s32 	%r43, %r71, %r75;
+	mov.b32 	%r44, %f24;
+	@%p12 st.shared.b32 [ %r43 + 0 ], %r44;
+	or.b32  	%r76, %r73, 32; // row r3+2
+	or.b32  	%r77, %r76, %r72;
+	add.s32 	%r45, %r71, %r77;
+	mov.b32 	%r46, %f26;
+	@%p12 st.shared.b32 [ %r45 + 0 ], %r46;
+	or.b32  	%r78, %r73, 48; // row r3+3
+	or.b32  	%r79, %r78, %r72;
+	add.s32 	%r47, %r71, %r79;
+	mov.b32 	%r48, %f28;
+	@%p12 st.shared.b32 [ %r47 + 0 ], %r48;
+	bar.sync 	0; // partials from all warps must be visible before the cross-warp reduction
+	setp.lt.s32 	%p16, %r1, 256; // holds for every thread given .maxntid 128
+	add.s32 	%r50, %r71, %r13; // float slot tid
+	@%p16 ld.shared.b32 %r49, [ %r50 + 0 ];
+	mov.b32 	%f29, %r49;
+	shfl.sync.bfly.b32	%r81, %r49, 2, 31, -1; // XOR-2 then XOR-1 exchange sums each aligned group of 4 lanes (the 4 warp partials of one row)
+	mov.b32 	%f30, %r81;
+$L__tmp10:
+	.loc	2 233 15
+	add.f32 	%f31, %f29, %f30;
+$L__tmp11:
+	.loc	2 243 36
+	mov.b32 	%r82, %f31;
+	shfl.sync.bfly.b32	%r83, %r82, 1, 31, -1;
+	mov.b32 	%f32, %r83;
+$L__tmp12:
+	.loc	2 233 15
+	add.f32 	%f33, %f31, %f32;
+$L__tmp13:
+	.loc	2 243 36
+	and.b32  	%r84, %r1, 3;
+	setp.eq.s32 	%p21, %r84, 0; // one writer per group of 4 threads
+	and.pred  	%p17, %p16, %p21;
+	mov.b32 	%r52, %f33;
+	@%p17 st.shared.b32 [ %r50 + 0 ], %r52; // row total overwrites the first slot of the group
+	add.s32 	%r54, %r50, 512; // float slot tid + 128: second half of the 256 partials
+	@%p16 ld.shared.b32 %r53, [ %r54 + 0 ];
+	mov.b32 	%f34, %r53;
+	shfl.sync.bfly.b32	%r85, %r53, 2, 31, -1;
+	mov.b32 	%f35, %r85;
+$L__tmp14:
+	.loc	2 233 15
+	add.f32 	%f36, %f34, %f35;
+$L__tmp15:
+	.loc	2 243 36
+	mov.b32 	%r86, %f36;
+	shfl.sync.bfly.b32	%r87, %r86, 1, 31, -1;
+	mov.b32 	%f37, %r87;
+$L__tmp16:
+	.loc	2 233 15
+	add.f32 	%f38, %f36, %f37;
+$L__tmp17:
+	.loc	2 243 36
+	mov.b32 	%r56, %f38;
+	@%p17 st.shared.b32 [ %r54 + 0 ], %r56;
+	bar.sync 	0; // row totals must be visible before they are read back
+	add.s32 	%r88, %r71, %r73;
+	ld.shared.f32 	%f39, [%r88]; // totals for rows r3+0 .. r3+3 (first slot of each 4-slot group)
+	add.s32 	%r89, %r71, %r74;
+	ld.shared.f32 	%f40, [%r89];
+	add.s32 	%r90, %r71, %r76;
+	ld.shared.f32 	%f41, [%r90];
+	add.s32 	%r91, %r71, %r78;
+	ld.shared.f32 	%f42, [%r91];
+$L__tmp18:
+	.loc	1 39 28
+	bar.sync 	0; // every thread has finished reading before the buffer is reused
+	add.s32 	%r92, %r71, %r68; // float slot r3: compact layout with one slot per row
+	st.shared.f32 	[%r92], %f39;
+	st.shared.f32 	[%r92+4], %f40;
+	st.shared.f32 	[%r92+8], %f41;
+	st.shared.f32 	[%r92+12], %f42;
+	bar.sync 	0;
+	shl.b32 	%r93, %r58, 2;
+	add.s32 	%r94, %r71, %r93;
+	ld.shared.u32 	%r57, [%r94]; // total for row (tid & 63)
+	.loc	1 40 25
+	mul.wide.s32 	%rd8, %r59, 4; // byte offset: 4 bytes per output element
+	add.s64 	%rd7, %rd3, %rd8;
+	.loc	1 40 36
+	and.b32  	%r95, %r1, 64;
+	setp.eq.s32 	%p20, %r95, 0; // only threads with bit 6 of tid clear store, so each row is written once
+	@%p20 st.global.b32 [ %rd7 + 0 ], { %r57 };
+	.loc	1 40 4
+	ret;
+$L__tmp19:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/qd/cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 266
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 113
+.b8 100
+.b8 118
+.b8 108
+.b8 116
+.b8 110
+.b8 100
+.b8 120
+.b8 99
+.b8 55
+.b8 118
+.b8 119
+.b8 106
+.b8 53
+.b8 106
+.b8 53
+.b8 100
+.b8 110
+.b8 115
+.b8 98
+.b8 55
+.b8 51
+.b8 116
+.b8 107
+.b8 55
+.b8 54
+.b8 51
+.b8 103
+.b8 97
+.b8 106
+.b8 102
+.b8 116
+.b8 106
+.b8 119
+.b8 118
+.b8 109
+.b8 98
+.b8 102
+.b8 113
+.b8 55
+.b8 105
+.b8 54
+.b8 115
+.b8 105
+.b8 116
+.b8 107
+.b8 53
+.b8 103
+.b8 119
+.b8 111
+.b8 121
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 113
+.b8 100
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp18
+.b8 2
+.b8 39
+.b8 25
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp17
+.b8 2
+.b8 39
+.b8 25
+.b8 4
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp17
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 270
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 270
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin ADDED Viewed

Binary file (16.5 kB). View file

.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir ADDED Viewed

	@@ -0,0 +1,243 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external addrspace(3) global [0 x i8]
+define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { ; Kernel: sums products of bf16 values from %0 and f32 values from %1 and stores 64 f32 sums per CTA through %2; %3 and %4 are not referenced in the body
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %7 = and i32 %6, 31, !dbg !8 ; lane index within the warp
+  %8 = lshr i32 %6, 5, !dbg !8
+  %9 = shl i32 %6, 2, !dbg !8
+  %10 = and i32 %9, 60, !dbg !8 ; (tid & 15) * 4: first of 4 consecutive rows owned by this thread
+  %11 = and i32 %8, 3, !dbg !9 ; warp index 0..3
+  %12 = lshr i32 %7, 4, !dbg !9 ; 1 for the upper half-warp, else 0
+  %13 = shl nuw nsw i32 %11, 1, !dbg !9
+  %14 = or i32 %13, %12, !dbg !9 ; warp*2 + half-warp bit: value 0..7 OR-ed into the loop counter below
+  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
+  %16 = shl i32 %15, 6, !dbg !11 ; CTA base row = ctaid.x * 64
+  %17 = or i32 %16, %10, !dbg !12 ; global index x of this thread's first row
+  %.frozen = freeze i32 %17
+  %18 = sdiv i32 %.frozen, 256, !dbg !13 ; q = x / 256
+  %19 = mul i32 %18, 256
+  %.decomposed = sub i32 %.frozen, %19 ; remainder x - q*256
+  %20 = shl i32 %18, 15, !dbg !14 ; q * 32768
+  %21 = add i32 %20, %.decomposed ; loop-invariant part of the element index
+  br label %22, !dbg !15
+22:                                               ; preds = %5, %22
+  %23 = phi i32 [ 0, %5 ], [ %58, %22 ] ; loop counter: 0, 8, ..., 120
+  %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ] ; running sums, one element per owned row
+  %25 = or i32 %23, %14, !dbg !16
+  %26 = shl i32 %25, 8, !dbg !17 ; times 256
+  %27 = add i32 %21, %26, !dbg !18
+  %28 = sext i32 %27 to i64, !dbg !19
+  %29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19 ; %0 is indexed in 2-byte elements
+  %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20 ; 8-byte load (4 packed 16-bit values); every predicate operand is the constant true
+  %31 = extractvalue { i32, i32 } %30, 0, !dbg !20
+  %32 = extractvalue { i32, i32 } %30, 1, !dbg !20
+  %33 = trunc i32 %31 to i16, !dbg !20 ; low half of word 0
+  %extelt.offset = lshr i32 %31, 16, !dbg !20
+  %34 = trunc i32 %extelt.offset to i16, !dbg !20 ; high half of word 0
+  %35 = trunc i32 %32 to i16, !dbg !20 ; low half of word 1
+  %extelt.offset1 = lshr i32 %32, 16, !dbg !20
+  %36 = trunc i32 %extelt.offset1 to i16, !dbg !20 ; high half of word 1
+  %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21 ; bf16 -> f32
+  %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21
+  %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21
+  %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21
+  %41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22 ; same element index, 4-byte elements
+  %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23 ; 16-byte load of 4 f32 values; every predicate operand is the constant true
+  %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23
+  %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23
+  %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23
+  %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23
+  %47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23
+  %48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23
+  %49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23
+  %50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23
+  %51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23
+  %52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24
+  %53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24
+  %54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24
+  %55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24
+  %56 = fmul <4 x float> %55, %51, !dbg !24 ; elementwise product of the two inputs
+  %57 = fadd <4 x float> %24, %56, !dbg !25 ; accumulate into the running sums
+  %58 = add nuw nsw i32 %23, 8, !dbg !15
+  %59 = icmp ult i32 %23, 120, !dbg !15 ; tested on the pre-increment counter, so the body also runs for 120: 16 iterations
+  br i1 %59, label %22, label %60, !dbg !15
+60:                                               ; preds = %22
+  %61 = and i32 %6, 63, !dbg !8 ; row this thread writes out at the very end
+  %62 = or i32 %16, %61, !dbg !12
+  %63 = or i32 %10, 3, !dbg !26 ; rows r+3, r+2, r+1 next to the first owned row %10
+  %64 = or i32 %10, 2, !dbg !26
+  %65 = or i32 %10, 1, !dbg !26
+  %66 = extractelement <4 x float> %57, i64 0, !dbg !26
+  %67 = bitcast float %66 to i32, !dbg !26
+  %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26 ; exchange with lane XOR 16 (full-warp member mask)
+  %69 = bitcast i32 %68 to float, !dbg !26
+  %70 = fadd float %66, %69, !dbg !30 ; combine the two half-warp partial sums
+  %71 = extractelement <4 x float> %57, i64 1, !dbg !26
+  %72 = bitcast float %71 to i32, !dbg !26
+  %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26
+  %74 = bitcast i32 %73 to float, !dbg !26
+  %75 = fadd float %71, %74, !dbg !30
+  %76 = extractelement <4 x float> %57, i64 2, !dbg !26
+  %77 = bitcast float %76 to i32, !dbg !26
+  %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26
+  %79 = bitcast i32 %78 to float, !dbg !26
+  %80 = fadd float %76, %79, !dbg !30
+  %81 = extractelement <4 x float> %57, i64 3, !dbg !26
+  %82 = bitcast float %81 to i32, !dbg !26
+  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26
+  %84 = bitcast i32 %83 to float, !dbg !26
+  %85 = fadd float %81, %84, !dbg !30
+  %86 = icmp ult i32 %7, 16, !dbg !26 ; only lanes 0..15 publish their per-warp partial
+  %87 = shl nuw nsw i32 %10, 2, !dbg !26 ; row * 4: each row has 4 shared-memory slots, one per warp
+  %88 = or i32 %87, %11, !dbg !26 ; slot index = row*4 + warp
+  %89 = zext nneg i32 %88 to i64, !dbg !26
+  %90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26
+  %91 = shl nuw nsw i32 %65, 2, !dbg !26
+  %92 = or i32 %91, %11, !dbg !26
+  %93 = zext nneg i32 %92 to i64, !dbg !26
+  %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26
+  %95 = shl nuw nsw i32 %64, 2, !dbg !26
+  %96 = or i32 %95, %11, !dbg !26
+  %97 = zext nneg i32 %96 to i64, !dbg !26
+  %98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26
+  %99 = shl nuw nsw i32 %63, 2, !dbg !26
+  %100 = or i32 %99, %11, !dbg !26
+  %101 = zext nneg i32 %100 to i64, !dbg !26
+  %102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26
+  tail call void @llvm.nvvm.barrier0(), !dbg !26 ; partials from all warps must be visible before the cross-warp reduction
+  %103 = icmp slt i32 %6, 256, !dbg !26 ; guard on tid < 256 for the shared-memory reduction pass
+  %104 = sext i32 %6 to i64, !dbg !26
+  %105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26 ; slot tid
+  %106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26
+  %107 = bitcast float %106 to i32, !dbg !26
+  %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26 ; XOR-2 then XOR-1 exchange sums each aligned group of 4 lanes (the 4 warp partials of one row)
+  %109 = bitcast i32 %108 to float, !dbg !26
+  %110 = fadd float %106, %109, !dbg !30
+  %111 = bitcast float %110 to i32, !dbg !26
+  %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26
+  %113 = bitcast i32 %112 to float, !dbg !26
+  %114 = fadd float %110, %113, !dbg !30
+  %115 = and i32 %6, 3, !dbg !26
+  %116 = icmp eq i32 %115, 0, !dbg !26 ; one writer per group of 4 threads
+  %117 = and i1 %103, %116, !dbg !26
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26 ; row total overwrites the first slot of the group
+  %118 = add i32 %6, 128, !dbg !26 ; second pass over slots tid + 128
+  %119 = sext i32 %118 to i64, !dbg !26
+  %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26
+  %121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26
+  %122 = bitcast float %121 to i32, !dbg !26
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26
+  %124 = bitcast i32 %123 to float, !dbg !26
+  %125 = fadd float %121, %124, !dbg !30
+  %126 = bitcast float %125 to i32, !dbg !26
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26
+  %128 = bitcast i32 %127 to float, !dbg !26
+  %129 = fadd float %125, %128, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26
+  tail call void @llvm.nvvm.barrier0(), !dbg !26 ; row totals must be visible before they are read back
+  %130 = zext nneg i32 %87 to i64, !dbg !26
+  %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26
+  %132 = load float, ptr addrspace(3) %131, align 4, !dbg !26 ; totals of the 4 owned rows (first slot of each 4-slot group)
+  %133 = zext nneg i32 %91 to i64, !dbg !26
+  %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26
+  %135 = load float, ptr addrspace(3) %134, align 4, !dbg !26
+  %136 = zext nneg i32 %95 to i64, !dbg !26
+  %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26
+  %138 = load float, ptr addrspace(3) %137, align 4, !dbg !26
+  %139 = zext nneg i32 %99 to i64, !dbg !26
+  %140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26
+  %141 = load float, ptr addrspace(3) %140, align 4, !dbg !26
+  tail call void @llvm.nvvm.barrier0(), !dbg !34 ; every thread has finished reading before the buffer is reused
+  %142 = zext nneg i32 %10 to i64, !dbg !34
+  %143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34 ; compact layout: one slot per row
+  %144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34
+  store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34
+  %145 = zext nneg i32 %65 to i64, !dbg !34
+  %146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34
+  %147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34
+  store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34
+  %148 = zext nneg i32 %64 to i64, !dbg !34
+  %149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34
+  %150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34
+  store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34
+  %151 = zext nneg i32 %63 to i64, !dbg !34
+  %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34
+  %153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34
+  store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34
+  tail call void @llvm.nvvm.barrier0(), !dbg !34
+  %154 = zext nneg i32 %61 to i64, !dbg !34
+  %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34
+  %156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34 ; total for row (tid & 63), kept as raw 32-bit pattern
+  %157 = sext i32 %62 to i64, !dbg !35
+  %158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35
+  %159 = and i32 %6, 64, !dbg !36
+  %160 = icmp eq i32 %159, 0, !dbg !36 ; only threads with bit 6 of tid clear perform the store
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36
+  ret void, !dbg !37
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj")
+!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 22, column: 44, scope: !5)
+!9 = !DILocation(line: 24, column: 33, scope: !5)
+!10 = !DILocation(line: 21, column: 28, scope: !5)
+!11 = !DILocation(line: 21, column: 33, scope: !5)
+!12 = !DILocation(line: 22, column: 23, scope: !5)
+!13 = !DILocation(line: 26, column: 20, scope: !5)
+!14 = !DILocation(line: 33, column: 57, scope: !5)
+!15 = !DILocation(line: 29, column: 36, scope: !5)
+!16 = !DILocation(line: 30, column: 27, scope: !5)
+!17 = !DILocation(line: 33, column: 44, scope: !5)
+!18 = !DILocation(line: 33, column: 51, scope: !5)
+!19 = !DILocation(line: 33, column: 34, scope: !5)
+!20 = !DILocation(line: 33, column: 63, scope: !5)
+!21 = !DILocation(line: 33, column: 115, scope: !5)
+!22 = !DILocation(line: 34, column: 34, scope: !5)
+!23 = !DILocation(line: 34, column: 63, scope: !5)
+!24 = !DILocation(line: 36, column: 22, scope: !5)
+!25 = !DILocation(line: 39, column: 38, scope: !5)
+!26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29)
+!27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
+!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!29 = !DILocation(line: 40, column: 25, scope: !27)
+!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32)
+!31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0)
+!32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33)
+!33 = !DILocation(line: 40, column: 25, scope: !31)
+!34 = !DILocation(line: 40, column: 28, scope: !5)
+!35 = !DILocation(line: 41, column: 25, scope: !5)
+!36 = !DILocation(line: 41, column: 36, scope: !5)
+!37 = !DILocation(line: 41, column: 4, scope: !5)

.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir ADDED Viewed

	@@ -0,0 +1,58 @@

+module {
+  tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
+    %c8_i32 = arith.constant 8 : i32
+    %c128_i32 = arith.constant 128 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant dense<32768> : tensor<64x1xi32>
+    %cst_1 = arith.constant dense<256> : tensor<1x8xi32>
+    %cst_2 = arith.constant dense<128> : tensor<1x8xi32>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
+    %cst_4 = arith.constant dense<256> : tensor<64x1xi32>
+    %c64_i32 = arith.constant 64 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
+    %5 = arith.addi %4, %3 : tensor<64x1xi32>
+    %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
+    %8 = arith.remsi %5, %cst_4 : tensor<64x1xi32>
+    %9 = arith.divsi %5, %cst_4 : tensor<64x1xi32>
+    %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %11 = arith.muli %9, %cst_0 : tensor<64x1xi32>
+    %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
+    %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32>)  : i32 {
+      %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
+      %21 = arith.addi %20, %7 : tensor<1x8xi32>
+      %22 = arith.cmpi slt, %21, %cst_2 : tensor<1x8xi32>
+      %23 = arith.muli %21, %cst_1 : tensor<1x8xi32>
+      %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
+      %25 = arith.addi %10, %24 : tensor<64x8xi32>
+      %26 = arith.addi %25, %12 : tensor<64x8xi32>
+      %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
+      %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
+      %29 = tt.load %27, %28, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
+      %30 = arith.extf %29 : tensor<64x8xbf16> to tensor<64x8xf32>
+      %31 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
+      %32 = tt.load %31, %28, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
+      %33 = arith.mulf %30, %32 : tensor<64x8xf32>
+      %34 = arith.addf %arg6, %33 : tensor<64x8xf32>
+      %35 = arith.select %28, %34, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
+      scf.yield %35 : tensor<64x8xf32>
+    }
+    %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %20 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %20 : f32
+    }) : (tensor<64x8xf32>) -> tensor<64xf32>
+    %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
+    %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
+    tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
+    tt.return
+  }
+}

.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,49 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<1024xi32, #blocked>
+    %cst_0 = arith.constant dense<3> : tensor<1024xi32, #blocked>
+    %cst_1 = arith.constant dense<768> : tensor<1024xi32, #blocked>
+    %cst_2 = arith.constant dense<2> : tensor<1024xi32, #blocked>
+    %cst_3 = arith.constant dense<0> : tensor<1024xi32, #blocked>
+    %cst_4 = arith.constant dense<1> : tensor<1024xi32, #blocked>
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
+    %5 = arith.divsi %4, %cst : tensor<1024xi32, #blocked>
+    %6 = arith.remsi %5, %cst_0 : tensor<1024xi32, #blocked>
+    %7 = arith.remsi %4, %cst : tensor<1024xi32, #blocked>
+    %8 = arith.divsi %4, %cst_1 : tensor<1024xi32, #blocked>
+    %9 = arith.muli %8, %cst : tensor<1024xi32, #blocked>
+    %10 = arith.addi %7, %9 : tensor<1024xi32, #blocked>
+    %11 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %12 = tt.addptr %11, %10 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
+    %14 = arith.extf %13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
+    %15 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %16 = tt.addptr %15, %10 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
+    %18 = arith.extf %17 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
+    %19 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %20 = tt.addptr %19, %10 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
+    %22 = arith.extf %21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
+    %23 = arith.cmpi eq, %6, %cst_2 : tensor<1024xi32, #blocked>
+    %24 = arith.select %23, %14, %cst_5 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
+    %25 = arith.cmpi eq, %6, %cst_4 : tensor<1024xi32, #blocked>
+    %26 = arith.select %25, %18, %cst_5 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
+    %27 = arith.addf %24, %26 : tensor<1024xf32, #blocked>
+    %28 = arith.cmpi eq, %6, %cst_3 : tensor<1024xi32, #blocked>
+    %29 = arith.select %28, %22, %cst_5 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
+    %30 = arith.addf %27, %29 : tensor<1024xf32, #blocked>
+    %31 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %32 = tt.addptr %31, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %33 = arith.truncf %30 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
+    tt.store %32, %33 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
+    tt.return
+  }
+}

.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttir ADDED Viewed

	@@ -0,0 +1,48 @@

+module {
+  tt.func public @triton__0d1d2d3d4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<0> : tensor<1024xi32>
+    %cst_0 = arith.constant dense<1> : tensor<1024xi32>
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32>
+    %cst_2 = arith.constant dense<2> : tensor<1024xi32>
+    %cst_3 = arith.constant dense<768> : tensor<1024xi32>
+    %cst_4 = arith.constant dense<3> : tensor<1024xi32>
+    %cst_5 = arith.constant dense<256> : tensor<1024xi32>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = arith.divsi %4, %cst_5 : tensor<1024xi32>
+    %6 = arith.remsi %5, %cst_4 : tensor<1024xi32>
+    %7 = arith.remsi %4, %cst_5 : tensor<1024xi32>
+    %8 = arith.divsi %4, %cst_3 : tensor<1024xi32>
+    %9 = arith.muli %8, %cst_5 : tensor<1024xi32>
+    %10 = arith.addi %7, %9 : tensor<1024xi32>
+    %11 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %12 = tt.addptr %11, %10 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16>
+    %14 = arith.extf %13 : tensor<1024xbf16> to tensor<1024xf32>
+    %15 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %16 = tt.addptr %15, %10 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16>
+    %18 = arith.extf %17 : tensor<1024xbf16> to tensor<1024xf32>
+    %19 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %20 = tt.addptr %19, %10 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16>
+    %22 = arith.extf %21 : tensor<1024xbf16> to tensor<1024xf32>
+    %23 = arith.cmpi eq, %6, %cst_2 : tensor<1024xi32>
+    %24 = arith.select %23, %14, %cst_1 : tensor<1024xi1>, tensor<1024xf32>
+    %25 = arith.cmpi eq, %6, %cst_0 : tensor<1024xi32>
+    %26 = arith.select %25, %18, %cst_1 : tensor<1024xi1>, tensor<1024xf32>
+    %27 = arith.addf %24, %26 : tensor<1024xf32>
+    %28 = arith.cmpi eq, %6, %cst : tensor<1024xi32>
+    %29 = arith.select %28, %22, %cst_1 : tensor<1024xi1>, tensor<1024xf32>
+    %30 = arith.addf %27, %29 : tensor<1024xf32>
+    %31 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %32 = tt.addptr %31, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %33 = arith.truncf %30 : tensor<1024xf32> to tensor<1024xbf16>
+    tt.store %32, %33 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
+    tt.return
+  }
+}

.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir ADDED Viewed

	@@ -0,0 +1,362 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %5 = shl i32 %4, 1, !dbg !10
+  %6 = and i32 %5, 510, !dbg !10
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
+  %8 = shl i32 %7, 9, !dbg !12
+  %9 = or i32 %8, %6, !dbg !13
+  %10 = sext i32 %9 to i64, !dbg !14
+  %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
+  %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
+  %13 = trunc i32 %12 to i16, !dbg !15
+  %extelt.offset = lshr i32 %12, 16, !dbg !15
+  %14 = trunc i32 %extelt.offset to i16, !dbg !15
+  %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
+  %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #4, !dbg !16
+  %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
+  %18 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %17, i1 true) #4, !dbg !18
+  %19 = trunc i32 %18 to i16, !dbg !18
+  %extelt.offset1 = lshr i32 %18, 16, !dbg !18
+  %20 = trunc i32 %extelt.offset1 to i16, !dbg !18
+  %21 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !19
+  %22 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !19
+  %23 = fmul float %21, 0x3FE6A09E60000000, !dbg !20
+  %24 = fmul float %22, 0x3FE6A09E60000000, !dbg !20
+  %25 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not.i = icmp eq i32 %25, 0, !dbg !21
+  %26 = tail call float @llvm.nvvm.fabs.ftz.f(float %23) #4, !dbg !21
+  %27 = tail call float @llvm.nvvm.fabs.f(float %23) #4, !dbg !21
+  %.0.i = select i1 %.not.i, float %27, float %26, !dbg !21
+  %28 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
+  br i1 %28, label %__nv_fabsf.exit1.i, label %30, !dbg !21
+__nv_fabsf.exit1.i:                               ; preds = %3
+  %29 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not1.i = icmp eq i32 %29, 0, !dbg !21
+  %.01.i = select i1 %.not1.i, float %27, float %26, !dbg !21
+  br label %__internal_fmad.exit.i, !dbg !21
+30:                                               ; preds = %3
+  %31 = fmul float %23, %23, !dbg !21
+  br label %__internal_fmad.exit.i, !dbg !21
+__internal_fmad.exit.i:                           ; preds = %30, %__nv_fabsf.exit1.i
+  %32 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %30 ], !dbg !21
+  %33 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %30 ], !dbg !21
+  %34 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %30 ], !dbg !21
+  %35 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %30 ], !dbg !21
+  %36 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %30 ], !dbg !21
+  %37 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %30 ], !dbg !21
+  %38 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %30 ], !dbg !21
+  %39 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %31, %30 ], !dbg !21
+  %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not2.i = icmp eq i32 %40, 0, !dbg !21
+  %41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %38, float %39, float %37) #4, !dbg !21
+  %42 = tail call float @llvm.nvvm.fma.rn.f(float %38, float %39, float %37) #4, !dbg !21
+  %.02.i = select i1 %.not2.i, float %42, float %41, !dbg !21
+  %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not3.i = icmp eq i32 %43, 0, !dbg !21
+  %44 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %39, float %36) #4, !dbg !21
+  %45 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %39, float %36) #4, !dbg !21
+  %.03.i = select i1 %.not3.i, float %45, float %44, !dbg !21
+  %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not4.i = icmp eq i32 %46, 0, !dbg !21
+  %47 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %39, float %35) #4, !dbg !21
+  %48 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %39, float %35) #4, !dbg !21
+  %.04.i = select i1 %.not4.i, float %48, float %47, !dbg !21
+  %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not5.i = icmp eq i32 %49, 0, !dbg !21
+  %50 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %39, float %34) #4, !dbg !21
+  %51 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %39, float %34) #4, !dbg !21
+  %.05.i = select i1 %.not5.i, float %51, float %50, !dbg !21
+  %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not6.i = icmp eq i32 %52, 0, !dbg !21
+  %53 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %39, float %33) #4, !dbg !21
+  %54 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %39, float %33) #4, !dbg !21
+  %.06.i = select i1 %.not6.i, float %54, float %53, !dbg !21
+  %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not7.i = icmp eq i32 %55, 0, !dbg !21
+  %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %39, float %32) #4, !dbg !21
+  %57 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %39, float %32) #4, !dbg !21
+  %.07.i = select i1 %.not7.i, float %57, float %56, !dbg !21
+  %58 = fneg float %39, !dbg !21
+  %59 = select i1 %28, float %58, float %23, !dbg !21
+  %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not8.i = icmp eq i32 %60, 0, !dbg !21
+  %61 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %59, float %59) #4, !dbg !21
+  %62 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %59, float %59) #4, !dbg !21
+  %.08.i = select i1 %.not8.i, float %62, float %61, !dbg !21
+  br i1 %28, label %63, label %__nv_erff.exit, !dbg !21
+63:                                               ; preds = %__internal_fmad.exit.i
+  %64 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
+  %65 = fsub float 1.000000e+00, %64, !dbg !21
+  %66 = bitcast float %65 to i32, !dbg !21
+  %67 = bitcast float %23 to i32, !dbg !21
+  %68 = and i32 %67, -2147483648, !dbg !21
+  %69 = or i32 %68, %66, !dbg !21
+  %70 = bitcast i32 %69 to float, !dbg !21
+  br label %__nv_erff.exit, !dbg !21
+__nv_erff.exit:                                   ; preds = %__internal_fmad.exit.i, %63
+  %r.0.i = phi float [ %70, %63 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
+  %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not.i2 = icmp eq i32 %71, 0, !dbg !21
+  %72 = tail call float @llvm.nvvm.fabs.ftz.f(float %24) #4, !dbg !21
+  %73 = tail call float @llvm.nvvm.fabs.f(float %24) #4, !dbg !21
+  %.0.i3 = select i1 %.not.i2, float %73, float %72, !dbg !21
+  %74 = fcmp oge float %.0.i3, 0x3FF00C1FC0000000, !dbg !21
+  br i1 %74, label %__nv_fabsf.exit1.i20, label %76, !dbg !21
+__nv_fabsf.exit1.i20:                             ; preds = %__nv_erff.exit
+  %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not1.i21 = icmp eq i32 %75, 0, !dbg !21
+  %.01.i22 = select i1 %.not1.i21, float %73, float %72, !dbg !21
+  br label %__internal_fmad.exit.i4, !dbg !21
+76:                                               ; preds = %__nv_erff.exit
+  %77 = fmul float %24, %24, !dbg !21
+  br label %__internal_fmad.exit.i4, !dbg !21
+__internal_fmad.exit.i4:                          ; preds = %76, %__nv_fabsf.exit1.i20
+  %78 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i20 ], [ 0x3FC06EBA60000000, %76 ], !dbg !21
+  %79 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i20 ], [ 0xBFD8127580000000, %76 ], !dbg !21
+  %80 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i20 ], [ 0x3FBCE315E0000000, %76 ], !dbg !21
+  %81 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i20 ], [ 0xBF9B837CE0000000, %76 ], !dbg !21
+  %82 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i20 ], [ 0x3F755ABD40000000, %76 ], !dbg !21
+  %83 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i20 ], [ 0xBF4AE9A400000000, %76 ], !dbg !21
+  %84 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i20 ], [ 0x3F163D2D40000000, %76 ], !dbg !21
+  %85 = phi float [ %.01.i22, %__nv_fabsf.exit1.i20 ], [ %77, %76 ], !dbg !21
+  %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not2.i5 = icmp eq i32 %86, 0, !dbg !21
+  %87 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %84, float %85, float %83) #4, !dbg !21
+  %88 = tail call float @llvm.nvvm.fma.rn.f(float %84, float %85, float %83) #4, !dbg !21
+  %.02.i6 = select i1 %.not2.i5, float %88, float %87, !dbg !21
+  %89 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not3.i7 = icmp eq i32 %89, 0, !dbg !21
+  %90 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i6, float %85, float %82) #4, !dbg !21
+  %91 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i6, float %85, float %82) #4, !dbg !21
+  %.03.i8 = select i1 %.not3.i7, float %91, float %90, !dbg !21
+  %92 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not4.i9 = icmp eq i32 %92, 0, !dbg !21
+  %93 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i8, float %85, float %81) #4, !dbg !21
+  %94 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i8, float %85, float %81) #4, !dbg !21
+  %.04.i10 = select i1 %.not4.i9, float %94, float %93, !dbg !21
+  %95 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not5.i11 = icmp eq i32 %95, 0, !dbg !21
+  %96 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i10, float %85, float %80) #4, !dbg !21
+  %97 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i10, float %85, float %80) #4, !dbg !21
+  %.05.i12 = select i1 %.not5.i11, float %97, float %96, !dbg !21
+  %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not6.i13 = icmp eq i32 %98, 0, !dbg !21
+  %99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i12, float %85, float %79) #4, !dbg !21
+  %100 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i12, float %85, float %79) #4, !dbg !21
+  %.06.i14 = select i1 %.not6.i13, float %100, float %99, !dbg !21
+  %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not7.i15 = icmp eq i32 %101, 0, !dbg !21
+  %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i14, float %85, float %78) #4, !dbg !21
+  %103 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i14, float %85, float %78) #4, !dbg !21
+  %.07.i16 = select i1 %.not7.i15, float %103, float %102, !dbg !21
+  %104 = fneg float %85, !dbg !21
+  %105 = select i1 %74, float %104, float %24, !dbg !21
+  %106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not8.i17 = icmp eq i32 %106, 0, !dbg !21
+  %107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i16, float %105, float %105) #4, !dbg !21
+  %108 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i16, float %105, float %105) #4, !dbg !21
+  %.08.i18 = select i1 %.not8.i17, float %108, float %107, !dbg !21
+  br i1 %74, label %109, label %__nv_erff.exit23, !dbg !21
+109:                                              ; preds = %__internal_fmad.exit.i4
+  %110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i18) #4, !dbg !21
+  %111 = fsub float 1.000000e+00, %110, !dbg !21
+  %112 = bitcast float %111 to i32, !dbg !21
+  %113 = bitcast float %24 to i32, !dbg !21
+  %114 = and i32 %113, -2147483648, !dbg !21
+  %115 = or i32 %114, %112, !dbg !21
+  %116 = bitcast i32 %115 to float, !dbg !21
+  br label %__nv_erff.exit23, !dbg !21
+__nv_erff.exit23:                                 ; preds = %__internal_fmad.exit.i4, %109
+  %r.0.i19 = phi float [ %116, %109 ], [ %.08.i18, %__internal_fmad.exit.i4 ], !dbg !21
+  %117 = fadd float %r.0.i, 1.000000e+00, !dbg !22
+  %118 = fadd float %r.0.i19, 1.000000e+00, !dbg !22
+  %119 = fmul float %117, 5.000000e-01, !dbg !23
+  %120 = fmul float %118, 5.000000e-01, !dbg !23
+  %121 = fmul float %21, %21, !dbg !24
+  %122 = fmul float %22, %22, !dbg !24
+  %123 = fmul float %121, -5.000000e-01, !dbg !25
+  %124 = fmul float %122, -5.000000e-01, !dbg !25
+  %125 = fmul float %123, 0x3FF7154760000000, !dbg !26
+  %126 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %125) #4, !dbg !26
+  %127 = fmul float %124, 0x3FF7154760000000, !dbg !26
+  %128 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %127) #4, !dbg !26
+  %129 = fmul float %126, 0x3FD9884540000000, !dbg !27
+  %130 = fmul float %128, 0x3FD9884540000000, !dbg !27
+  %131 = fmul float %21, %129, !dbg !28
+  %132 = fmul float %22, %130, !dbg !28
+  %133 = fadd float %119, %131, !dbg !29
+  %134 = fadd float %120, %132, !dbg !29
+  %135 = fmul float %15, %133, !dbg !30
+  %136 = fmul float %16, %134, !dbg !30
+  %137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %135) #4, !dbg !31
+  %138 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %136) #4, !dbg !31
+  %139 = insertelement <2 x i16> undef, i16 %137, i64 0, !dbg !31
+  %140 = insertelement <2 x i16> %139, i16 %138, i64 1, !dbg !31
+  %141 = bitcast <2 x i16> %140 to i32, !dbg !31
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %141, ptr addrspace(1) %11, i1 true) #4, !dbg !31
+  ret void, !dbg !32
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: alwaysinline nounwind
+define float @__nv_erff(float %a) local_unnamed_addr #1 {
+__nv_fabsf.exit:
+  %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not = icmp eq i32 %0, 0
+  %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
+  %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
+  %.0 = select i1 %.not, float %2, float %1
+  %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
+  br i1 %3, label %__nv_fabsf.exit1, label %5
+__nv_fabsf.exit1:                                 ; preds = %__nv_fabsf.exit
+  %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not1 = icmp eq i32 %4, 0
+  %.01 = select i1 %.not1, float %2, float %1
+  br label %__internal_fmad.exit
+5:                                                ; preds = %__nv_fabsf.exit
+  %6 = fmul float %a, %a
+  br label %__internal_fmad.exit
+__internal_fmad.exit:                             ; preds = %5, %__nv_fabsf.exit1
+  %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
+  %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
+  %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
+  %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
+  %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
+  %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
+  %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
+  %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
+  %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not2 = icmp eq i32 %15, 0
+  %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
+  %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
+  %.02 = select i1 %.not2, float %17, float %16
+  %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not3 = icmp eq i32 %18, 0
+  %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
+  %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
+  %.03 = select i1 %.not3, float %20, float %19
+  %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not4 = icmp eq i32 %21, 0
+  %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
+  %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
+  %.04 = select i1 %.not4, float %23, float %22
+  %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not5 = icmp eq i32 %24, 0
+  %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
+  %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
+  %.05 = select i1 %.not5, float %26, float %25
+  %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not6 = icmp eq i32 %27, 0
+  %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
+  %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
+  %.06 = select i1 %.not6, float %29, float %28
+  %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not7 = icmp eq i32 %30, 0
+  %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
+  %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
+  %.07 = select i1 %.not7, float %32, float %31
+  %33 = fneg float %14
+  %34 = select i1 %3, float %33, float %a
+  %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not8 = icmp eq i32 %35, 0
+  %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
+  %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
+  %.08 = select i1 %.not8, float %37, float %36
+  br i1 %3, label %38, label %46
+38:                                               ; preds = %__internal_fmad.exit
+  %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
+  %40 = fsub float 1.000000e+00, %39
+  %41 = bitcast float %40 to i32
+  %42 = bitcast float %a to i32
+  %43 = and i32 %42, -2147483648
+  %44 = or i32 %43, %41
+  %45 = bitcast i32 %44 to float
+  br label %46
+46:                                               ; preds = %38, %__internal_fmad.exit
+  %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
+  ret float %r.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.ftz.f(float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.f(float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
+!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 21, column: 36, scope: !7)
+!11 = !DILocation(line: 20, column: 28, scope: !7)
+!12 = !DILocation(line: 20, column: 33, scope: !7)
+!13 = !DILocation(line: 21, column: 23, scope: !7)
+!14 = !DILocation(line: 24, column: 34, scope: !7)
+!15 = !DILocation(line: 24, column: 39, scope: !7)
+!16 = !DILocation(line: 24, column: 48, scope: !7)
+!17 = !DILocation(line: 25, column: 30, scope: !7)
+!18 = !DILocation(line: 25, column: 35, scope: !7)
+!19 = !DILocation(line: 25, column: 44, scope: !7)
+!20 = !DILocation(line: 29, column: 18, scope: !7)
+!21 = !DILocation(line: 30, column: 23, scope: !7)
+!22 = !DILocation(line: 32, column: 18, scope: !7)
+!23 = !DILocation(line: 34, column: 19, scope: !7)
+!24 = !DILocation(line: 35, column: 19, scope: !7)
+!25 = !DILocation(line: 37, column: 20, scope: !7)
+!26 = !DILocation(line: 38, column: 19, scope: !7)
+!27 = !DILocation(line: 40, column: 20, scope: !7)
+!28 = !DILocation(line: 41, column: 19, scope: !7)
+!29 = !DILocation(line: 42, column: 20, scope: !7)
+!30 = !DILocation(line: 43, column: 19, scope: !7)
+!31 = !DILocation(line: 45, column: 40, scope: !7)
+!32 = !DILocation(line: 45, column: 4, scope: !7)

.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir ADDED Viewed

	@@ -0,0 +1,278 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external addrspace(3) global [0 x i8]
+; Kernel entry point (annotated as "kernel" with maxntidx = 64 in !nvvm.annotations).
+; Each block covers 256 consecutive elements (ctaid.x << 8); each thread handles
+; 4 of them ((tid.x << 2) & 252).
+;
+; Arguments, as used by the body:
+;   %0      float buffer (addrspace 1): read at the flat index, then overwritten with the result
+;   %1      16-bit buffer: read at the flat index and converted bf16 -> f32
+;   %2      float buffer: read at the in-block offset only, so every block reads the same 256 values
+;   %3      float buffer: read at the flat index
+;   %4      float buffer: one scalar per block, indexed by ctaid.x
+;   %5      16-bit buffer: receives the result rounded to bf16
+;   %6, %7  i32 values that are never referenced in the body
+;
+; Per element the body computes
+;   out = (%4[block] / 256) * (256*p - sum(p) - %3[i] * sum(p * %3[i])) + %0[i]
+; where p = f32(%1[i]) * %2[offset] and both sums run over the block's 256 elements.
+; NOTE(review): this looks like a normalization backward pass — confirm against
+; the generating Python kernel before relying on that reading.
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !5 {
+  ; --- Thread indexing ---
+  ; %10 = tid.x & 31 and %12 = (tid.x >> 5) & 1 (lane / warp index, assuming 32-lane warps).
+  ; %13 = element offset inside the block's 256-element span, a multiple of 4.
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %10 = and i32 %9, 31, !dbg !8
+  %11 = lshr i32 %9, 5, !dbg !8
+  %12 = and i32 %11, 1, !dbg !8
+  %urem = shl i32 %9, 2, !dbg !8
+  %13 = and i32 %urem, 252, !dbg !8
+  ; %14 = block index; %16/%17 = flat element index = block * 256 + offset.
+  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
+  %15 = shl i32 %14, 8, !dbg !10
+  %16 = or i32 %15, %13, !dbg !11
+  %17 = sext i32 %16 to i64, !dbg !12
+  ; --- Load four 16-bit values from %1 as two 32-bit words (every predicate is the constant true) ---
+  %18 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !12
+  %19 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
+  %20 = extractvalue { i32, i32 } %19, 0, !dbg !13
+  %21 = extractvalue { i32, i32 } %19, 1, !dbg !13
+  ; Split each 32-bit word into its low and high 16-bit halves.
+  %22 = trunc i32 %20 to i16, !dbg !13
+  %extelt.offset = lshr i32 %20, 16, !dbg !13
+  %23 = trunc i32 %extelt.offset to i16, !dbg !13
+  %24 = trunc i32 %21 to i16, !dbg !13
+  %extelt.offset1 = lshr i32 %21, 16, !dbg !13
+  %25 = trunc i32 %extelt.offset1 to i16, !dbg !13
+  ; Widen the four bf16 values to f32.
+  %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #3, !dbg !14
+  %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
+  %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
+  %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
+  ; --- Load four floats from %2, indexed by the in-block offset only (evict_last cache hint) ---
+  %30 = zext nneg i32 %13 to i64, !dbg !15
+  %31 = getelementptr float, ptr addrspace(1) %2, i64 %30, !dbg !15
+  %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %31, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
+  %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !16
+  %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !16
+  %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !16
+  %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !16
+  %37 = bitcast i32 %33 to float, !dbg !16
+  %38 = bitcast i32 %34 to float, !dbg !16
+  %39 = bitcast i32 %35 to float, !dbg !16
+  %40 = bitcast i32 %36 to float, !dbg !16
+  ; --- Load four floats from %3 at the flat index ---
+  %41 = getelementptr float, ptr addrspace(1) %3, i64 %17, !dbg !17
+  %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
+  %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !18
+  %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !18
+  %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !18
+  %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !18
+  %47 = bitcast i32 %43 to float, !dbg !18
+  %48 = bitcast i32 %44 to float, !dbg !18
+  %49 = bitcast i32 %45 to float, !dbg !18
+  %50 = bitcast i32 %46 to float, !dbg !18
+  ; --- Load four floats from %0 at the flat index (the same address is stored to at the end) ---
+  %51 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !19
+  %52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
+  %53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !20
+  %54 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !20
+  %55 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !20
+  %56 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !20
+  %57 = bitcast i32 %53 to float, !dbg !20
+  %58 = bitcast i32 %54 to float, !dbg !20
+  %59 = bitcast i32 %55 to float, !dbg !20
+  %60 = bitcast i32 %56 to float, !dbg !20
+  ; --- Load the per-block scalar from %4[ctaid.x] ---
+  ; The same address is loaded four times; only the first copy (%64 -> %148) feeds the result.
+  %61 = sext i32 %14 to i64, !dbg !21
+  %62 = getelementptr float, ptr addrspace(1) %4, i64 %61, !dbg !21
+  %63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
+  %64 = bitcast i32 %63 to float, !dbg !22
+  %65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
+  %66 = bitcast i32 %65 to float, !dbg !22
+  %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
+  %68 = bitcast i32 %67 to float, !dbg !22
+  %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
+  %70 = bitcast i32 %69 to float, !dbg !22
+  ; --- p = f32(%1 values) * (%2 values) ---
+  %71 = fmul float %26, %37, !dbg !23
+  %72 = fmul float %27, %38, !dbg !23
+  %73 = fmul float %28, %39, !dbg !23
+  %74 = fmul float %29, %40, !dbg !23
+  ; --- First reduction: sum of p over the block ---
+  ; Per-thread sum of the four products.
+  %75 = fadd float %71, %72, !dbg !24
+  %76 = fadd float %73, %75, !dbg !24
+  %77 = fadd float %74, %76, !dbg !24
+  ; Butterfly shuffles with lane masks 16, 8, 4, 2, 1 (member mask -1) accumulate the warp total.
+  %78 = bitcast float %77 to i32, !dbg !30
+  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !30
+  %80 = bitcast i32 %79 to float, !dbg !30
+  %81 = fadd float %77, %80, !dbg !24
+  %82 = bitcast float %81 to i32, !dbg !30
+  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !30
+  %84 = bitcast i32 %83 to float, !dbg !30
+  %85 = fadd float %81, %84, !dbg !24
+  %86 = bitcast float %85 to i32, !dbg !30
+  %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !30
+  %88 = bitcast i32 %87 to float, !dbg !30
+  %89 = fadd float %85, %88, !dbg !24
+  %90 = bitcast float %89 to i32, !dbg !30
+  %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !30
+  %92 = bitcast i32 %91 to float, !dbg !30
+  %93 = fadd float %89, %92, !dbg !24
+  %94 = bitcast float %93 to i32, !dbg !30
+  %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !30
+  %96 = bitcast i32 %95 to float, !dbg !30
+  %97 = fadd float %93, %96, !dbg !24
+  ; Threads with (tid.x & 31) == 0 store their warp total to global_smem[(tid.x >> 5) & 1].
+  %98 = icmp eq i32 %10, 0, !dbg !30
+  %99 = zext nneg i32 %12 to i64, !dbg !30
+  %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %97, i1 %98) #3, !dbg !30
+  tail call void @llvm.nvvm.barrier0(), !dbg !30
+  ; Threads with tid.x < 2 read global_smem[tid.x]; one shuffle with lane mask 1 adds the two slots.
+  %101 = icmp slt i32 %9, 2, !dbg !30
+  %102 = sext i32 %9 to i64, !dbg !30
+  %103 = getelementptr float, ptr addrspace(3) @global_smem, i64 %102, !dbg !30
+  %104 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !30
+  %105 = bitcast float %104 to i32, !dbg !30
+  %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !30
+  %107 = bitcast i32 %106 to float, !dbg !30
+  %108 = fadd float %104, %107, !dbg !24
+  ; Write-back is predicated on tid.x < 2 and tid.x even, so the total lands in global_smem[0].
+  %109 = and i32 %9, 1, !dbg !30
+  %110 = icmp eq i32 %109, 0, !dbg !30
+  %111 = and i1 %101, %110, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %108, i1 %111) #3, !dbg !30
+  tail call void @llvm.nvvm.barrier0(), !dbg !30
+  ; Every thread reads the block total; %113 = sum(p).
+  %112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !30
+  %113 = fadd float %112, 0.000000e+00, !dbg !32
+  ; --- Second reduction: sum of p * (%3 values), same scheme as above ---
+  %114 = fmul float %71, %47, !dbg !36
+  %115 = fmul float %72, %48, !dbg !36
+  %116 = fmul float %73, %49, !dbg !36
+  %117 = fmul float %74, %50, !dbg !36
+  ; Barrier ahead of the second round of shared stores — presumably so every thread has
+  ; read the first total before global_smem is overwritten.
+  tail call void @llvm.nvvm.barrier0(), !dbg !37
+  %118 = fadd float %114, %115, !dbg !39
+  %119 = fadd float %116, %118, !dbg !39
+  %120 = fadd float %117, %119, !dbg !39
+  %121 = bitcast float %120 to i32, !dbg !37
+  %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !37
+  %123 = bitcast i32 %122 to float, !dbg !37
+  %124 = fadd float %120, %123, !dbg !39
+  %125 = bitcast float %124 to i32, !dbg !37
+  %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 8, i32 31), !dbg !37
+  %127 = bitcast i32 %126 to float, !dbg !37
+  %128 = fadd float %124, %127, !dbg !39
+  %129 = bitcast float %128 to i32, !dbg !37
+  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 4, i32 31), !dbg !37
+  %131 = bitcast i32 %130 to float, !dbg !37
+  %132 = fadd float %128, %131, !dbg !39
+  %133 = bitcast float %132 to i32, !dbg !37
+  %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !37
+  %135 = bitcast i32 %134 to float, !dbg !37
+  %136 = fadd float %132, %135, !dbg !39
+  %137 = bitcast float %136 to i32, !dbg !37
+  %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !37
+  %139 = bitcast i32 %138 to float, !dbg !37
+  %140 = fadd float %136, %139, !dbg !39
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %140, i1 %98) #3, !dbg !37
+  tail call void @llvm.nvvm.barrier0(), !dbg !37
+  %141 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !37
+  %142 = bitcast float %141 to i32, !dbg !37
+  %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 1, i32 31), !dbg !37
+  %144 = bitcast i32 %143 to float, !dbg !37
+  %145 = fadd float %141, %144, !dbg !39
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %145, i1 %111) #3, !dbg !37
+  tail call void @llvm.nvvm.barrier0(), !dbg !37
+  ; %147 = sum(p * %3 values) over the block.
+  %146 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
+  %147 = fadd float %146, 0.000000e+00, !dbg !42
+  ; --- Final arithmetic ---
+  ; Per-block scalar divided by 256; %149-%151 are computed but not used afterwards.
+  %148 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %64, float 2.560000e+02) #3, !dbg !44
+  %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %66, float 2.560000e+02) #3, !dbg !44
+  %150 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %68, float 2.560000e+02) #3, !dbg !44
+  %151 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %70, float 2.560000e+02) #3, !dbg !44
+  ; 256 * p
+  %152 = fmul float %71, 2.560000e+02, !dbg !45
+  %153 = fmul float %72, 2.560000e+02, !dbg !45
+  %154 = fmul float %73, 2.560000e+02, !dbg !45
+  %155 = fmul float %74, 2.560000e+02, !dbg !45
+  ; ... - sum(p)
+  %156 = fsub float %152, %113, !dbg !46
+  %157 = fsub float %153, %113, !dbg !46
+  %158 = fsub float %154, %113, !dbg !46
+  %159 = fsub float %155, %113, !dbg !46
+  ; (%3 values) * sum(p * %3 values)
+  %160 = fmul float %147, %47, !dbg !47
+  %161 = fmul float %147, %48, !dbg !47
+  %162 = fmul float %147, %49, !dbg !47
+  %163 = fmul float %147, %50, !dbg !47
+  %164 = fsub float %156, %160, !dbg !48
+  %165 = fsub float %157, %161, !dbg !48
+  %166 = fsub float %158, %162, !dbg !48
+  %167 = fsub float %159, %163, !dbg !48
+  ; Scale by the per-block factor, then add the values previously loaded from %0.
+  %168 = fmul float %148, %164, !dbg !49
+  %169 = fmul float %148, %165, !dbg !49
+  %170 = fmul float %148, %166, !dbg !49
+  %171 = fmul float %148, %167, !dbg !49
+  %172 = fadd float %168, %57, !dbg !50
+  %173 = fadd float %169, %58, !dbg !50
+  %174 = fadd float %170, %59, !dbg !50
+  %175 = fadd float %171, %60, !dbg !50
+  ; --- Store the f32 result back to %0 at the flat index (in place) ---
+  %176 = bitcast float %172 to i32, !dbg !51
+  %177 = bitcast float %173 to i32, !dbg !51
+  %178 = bitcast float %174 to i32, !dbg !51
+  %179 = bitcast float %175 to i32, !dbg !51
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %176, i32 %177, i32 %178, i32 %179, ptr addrspace(1) %51, i1 true) #3, !dbg !51
+  ; --- Round the result to bf16, pack pairs into 32-bit words, and store to %5 ---
+  %180 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52
+  %181 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %172) #3, !dbg !53
+  %182 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %173) #3, !dbg !53
+  %183 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %174) #3, !dbg !53
+  %184 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %175) #3, !dbg !53
+  %185 = insertelement <2 x i16> undef, i16 %181, i64 0, !dbg !53
+  %186 = insertelement <2 x i16> %185, i16 %182, i64 1, !dbg !53
+  %187 = bitcast <2 x i16> %186 to i32, !dbg !53
+  %188 = insertelement <2 x i16> undef, i16 %183, i64 0, !dbg !53
+  %189 = insertelement <2 x i16> %188, i16 %184, i64 1, !dbg !53
+  %190 = bitcast <2 x i16> %189 to i32, !dbg !53
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %187, i32 %190, ptr addrspace(1) %180, i1 true) #3, !dbg !53
+  ret void, !dbg !54
+}
+; Intrinsics called by the kernel: the tid.x special-register read, the
+; butterfly shuffle used in the reductions, and the barrier placed around the
+; shared-memory exchange.
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+; Attribute groups: #0-#2 belong to the declarations above; #3 is attached to
+; the inline-asm call sites inside the kernel.
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nounwind }
+; Named module metadata: module flags, the debug compile unit, and the NVVM
+; annotations (which list !3 and !4 twice each).
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py", directory: "/tmp/torchinductor_root/rn")
+; !3 marks the function as a kernel; !4 sets maxntidx to 64.
+!3 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
+; Subprogram for the kernel, declared at line 18 of the generated Python file (!2).
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+; Source locations scoped directly to the kernel (!5): indexing and loads.
+!8 = !DILocation(line: 26, column: 26, scope: !5)
+!9 = !DILocation(line: 23, column: 28, scope: !5)
+!10 = !DILocation(line: 30, column: 40, scope: !5)
+!11 = !DILocation(line: 30, column: 36, scope: !5)
+!12 = !DILocation(line: 30, column: 30, scope: !5)
+!13 = !DILocation(line: 30, column: 46, scope: !5)
+!14 = !DILocation(line: 30, column: 67, scope: !5)
+!15 = !DILocation(line: 31, column: 30, scope: !5)
+!16 = !DILocation(line: 31, column: 35, scope: !5)
+!17 = !DILocation(line: 32, column: 30, scope: !5)
+!18 = !DILocation(line: 32, column: 46, scope: !5)
+!19 = !DILocation(line: 33, column: 35, scope: !5)
+!20 = !DILocation(line: 33, column: 51, scope: !5)
+!21 = !DILocation(line: 34, column: 31, scope: !5)
+!22 = !DILocation(line: 34, column: 36, scope: !5)
+!23 = !DILocation(line: 36, column: 18, scope: !5)
+; Locations for code inlined from standard.py (!26) and triton_helpers.py (!34);
+; the inlinedAt chains lead back to kernel lines 39 and 43, where the two
+; reductions are attached.
+!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !28)
+!25 = distinct !DILexicalBlockFile(scope: !27, file: !26, discriminator: 0)
+!26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!27 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0)
+!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
+!29 = !DILocation(line: 39, column: 57, scope: !25)
+!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
+!31 = !DILocation(line: 39, column: 57, scope: !27)
+!32 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !35)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !34, discriminator: 0)
+!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!35 = !DILocation(line: 39, column: 44, scope: !33)
+!36 = !DILocation(line: 40, column: 18, scope: !5)
+!37 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !38)
+!38 = !DILocation(line: 43, column: 59, scope: !27)
+!39 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !40)
+!40 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !41)
+!41 = !DILocation(line: 43, column: 59, scope: !25)
+!42 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !43)
+!43 = !DILocation(line: 43, column: 45, scope: !33)
+; Kernel-scoped locations for the final arithmetic and the two stores.
+!44 = !DILocation(line: 45, column: 20, scope: !5)
+!45 = !DILocation(line: 46, column: 19, scope: !5)
+!46 = !DILocation(line: 47, column: 20, scope: !5)
+!47 = !DILocation(line: 48, column: 19, scope: !5)
+!48 = !DILocation(line: 49, column: 20, scope: !5)
+!49 = !DILocation(line: 50, column: 20, scope: !5)
+!50 = !DILocation(line: 51, column: 20, scope: !5)
+!51 = !DILocation(line: 53, column: 51, scope: !5)
+!52 = !DILocation(line: 54, column: 25, scope: !5)
+!53 = !DILocation(line: 54, column: 48, scope: !5)
+!54 = !DILocation(line: 54, column: 4, scope: !5)