0-hero commited on
Commit
9b31431
·
verified ·
1 Parent(s): afafe68

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .cache/pip/http-v2/8/8/b/8/b/88b8bfb8f1d620e081b2b226e7936019ed96f3fadbfbd878f24e6be7.body +119 -0
  2. .cache/pip/http-v2/f/6/8/9/5/f689581dbe86ea3c1a14226c252116f97e87dde2c835d7d60fd42b59.body +280 -0
  3. .local/share/jupyter/nbextensions/code_font_size/code_font_size.js +70 -0
  4. .local/share/jupyter/nbextensions/code_prettify/README_code_prettify.md +300 -0
  5. .local/share/jupyter/nbextensions/code_prettify/demo_2to3.gif +0 -0
  6. .local/share/jupyter/nbextensions/code_prettify/isort.yaml +40 -0
  7. .local/share/jupyter/nbextensions/codefolding/codefolding.yaml +17 -0
  8. .local/share/jupyter/nbextensions/codefolding/codefolding_editor.yaml +18 -0
  9. .local/share/jupyter/nbextensions/codefolding/codefolding_indent_folded_2.png +0 -0
  10. .local/share/jupyter/nbextensions/codefolding/firstline-fold.js +14 -0
  11. .local/share/jupyter/nbextensions/codefolding/magic-fold.js +14 -0
  12. .local/share/jupyter/nbextensions/codefolding/magic-unfolded.png +0 -0
  13. .local/share/jupyter/nbextensions/codemirror_mode_extensions/main.js +11 -0
  14. .local/share/jupyter/nbextensions/collapsible_headings/main.css +130 -0
  15. .local/share/jupyter/nbextensions/collapsible_headings/screenshot.png +0 -0
  16. .local/share/jupyter/nbextensions/comment-uncomment/main.js +63 -0
  17. .local/share/jupyter/nbextensions/datestamper/readme.md +6 -0
  18. .local/share/jupyter/nbextensions/equation-numbering/button.png +0 -0
  19. .local/share/jupyter/nbextensions/execute_time/execution-timings-menu.png +0 -0
  20. .local/share/jupyter/nbextensions/exercise/exercise.yaml +28 -0
  21. .local/share/jupyter/nbextensions/exercise/main.css +13 -0
  22. .local/share/jupyter/nbextensions/exercise/readme.md +52 -0
  23. .local/share/jupyter/nbextensions/help_panel/help_panel_ext_fullscreen.png +0 -0
  24. .local/share/jupyter/nbextensions/help_panel/readme.md +15 -0
  25. .local/share/jupyter/nbextensions/hide_input/hide-input.yaml +7 -0
  26. .local/share/jupyter/nbextensions/hide_input/main.js +54 -0
  27. .local/share/jupyter/nbextensions/hide_input_all/hide_input_all_hide.png +0 -0
  28. .local/share/jupyter/nbextensions/hide_input_all/icon.png +0 -0
  29. .local/share/jupyter/nbextensions/hide_input_all/main.js +59 -0
  30. .local/share/jupyter/nbextensions/highlight_selected_word/README.md +117 -0
  31. .local/share/jupyter/nbextensions/highlight_selected_word/configurator.yaml +131 -0
  32. .local/share/jupyter/nbextensions/highlighter/demo_highlighter.ipynb +96 -0
  33. .local/share/jupyter/nbextensions/keyboard_shortcut_editor/icon.png +0 -0
  34. .local/share/jupyter/nbextensions/keyboard_shortcut_editor/readme_undefined_key.png +0 -0
  35. .local/share/jupyter/nbextensions/load_tex_macros/main.js +39 -0
  36. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin +0 -0
  37. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx +651 -0
  38. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir +60 -0
  39. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir +53 -0
  40. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.cubin +0 -0
  41. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir +162 -0
  42. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx +338 -0
  43. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx +572 -0
  44. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin +0 -0
  45. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir +243 -0
  46. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir +58 -0
  47. .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttgir +49 -0
  48. .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttir +48 -0
  49. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir +362 -0
  50. .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir +278 -0
.cache/pip/http-v2/8/8/b/8/b/88b8bfb8f1d620e081b2b226e7936019ed96f3fadbfbd878f24e6be7.body ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: requests
3
+ Version: 2.32.3
4
+ Summary: Python HTTP for Humans.
5
+ Home-page: https://requests.readthedocs.io
6
+ Author: Kenneth Reitz
7
+ Author-email: [email protected]
8
+ License: Apache-2.0
9
+ Project-URL: Documentation, https://requests.readthedocs.io
10
+ Project-URL: Source, https://github.com/psf/requests
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Environment :: Web Environment
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Natural Language :: English
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3 :: Only
25
+ Classifier: Programming Language :: Python :: Implementation :: CPython
26
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
27
+ Classifier: Topic :: Internet :: WWW/HTTP
28
+ Classifier: Topic :: Software Development :: Libraries
29
+ Requires-Python: >=3.8
30
+ Description-Content-Type: text/markdown
31
+ License-File: LICENSE
32
+ Requires-Dist: charset-normalizer <4,>=2
33
+ Requires-Dist: idna <4,>=2.5
34
+ Requires-Dist: urllib3 <3,>=1.21.1
35
+ Requires-Dist: certifi >=2017.4.17
36
+ Provides-Extra: security
37
+ Provides-Extra: socks
38
+ Requires-Dist: PySocks !=1.5.7,>=1.5.6 ; extra == 'socks'
39
+ Provides-Extra: use_chardet_on_py3
40
+ Requires-Dist: chardet <6,>=3.0.2 ; extra == 'use_chardet_on_py3'
41
+
42
+ # Requests
43
+
44
+ **Requests** is a simple, yet elegant, HTTP library.
45
+
46
+ ```python
47
+ >>> import requests
48
+ >>> r = requests.get('https://httpbin.org/basic-auth/user/pass', auth=('user', 'pass'))
49
+ >>> r.status_code
50
+ 200
51
+ >>> r.headers['content-type']
52
+ 'application/json; charset=utf8'
53
+ >>> r.encoding
54
+ 'utf-8'
55
+ >>> r.text
56
+ '{"authenticated": true, ...'
57
+ >>> r.json()
58
+ {'authenticated': True, ...}
59
+ ```
60
+
61
+ Requests allows you to send HTTP/1.1 requests extremely easily. There’s no need to manually add query strings to your URLs, or to form-encode your `PUT` & `POST` data — but nowadays, just use the `json` method!
62
+
63
+ Requests is one of the most downloaded Python packages today, pulling in around `30M downloads / week`— according to GitHub, Requests is currently [depended upon](https://github.com/psf/requests/network/dependents?package_id=UGFja2FnZS01NzA4OTExNg%3D%3D) by `1,000,000+` repositories. You may certainly put your trust in this code.
64
+
65
+ [![Downloads](https://static.pepy.tech/badge/requests/month)](https://pepy.tech/project/requests)
66
+ [![Supported Versions](https://img.shields.io/pypi/pyversions/requests.svg)](https://pypi.org/project/requests)
67
+ [![Contributors](https://img.shields.io/github/contributors/psf/requests.svg)](https://github.com/psf/requests/graphs/contributors)
68
+
69
+ ## Installing Requests and Supported Versions
70
+
71
+ Requests is available on PyPI:
72
+
73
+ ```console
74
+ $ python -m pip install requests
75
+ ```
76
+
77
+ Requests officially supports Python 3.8+.
78
+
79
+ ## Supported Features & Best–Practices
80
+
81
+ Requests is ready for the demands of building robust and reliable HTTP–speaking applications, for the needs of today.
82
+
83
+ - Keep-Alive & Connection Pooling
84
+ - International Domains and URLs
85
+ - Sessions with Cookie Persistence
86
+ - Browser-style TLS/SSL Verification
87
+ - Basic & Digest Authentication
88
+ - Familiar `dict`–like Cookies
89
+ - Automatic Content Decompression and Decoding
90
+ - Multi-part File Uploads
91
+ - SOCKS Proxy Support
92
+ - Connection Timeouts
93
+ - Streaming Downloads
94
+ - Automatic honoring of `.netrc`
95
+ - Chunked HTTP Requests
96
+
97
+ ## API Reference and User Guide available on [Read the Docs](https://requests.readthedocs.io)
98
+
99
+ [![Read the Docs](https://raw.githubusercontent.com/psf/requests/main/ext/ss.png)](https://requests.readthedocs.io)
100
+
101
+ ## Cloning the repository
102
+
103
+ When cloning the Requests repository, you may need to add the `-c
104
+ fetch.fsck.badTimezone=ignore` flag to avoid an error about a bad commit (see
105
+ [this issue](https://github.com/psf/requests/issues/2690) for more background):
106
+
107
+ ```shell
108
+ git clone -c fetch.fsck.badTimezone=ignore https://github.com/psf/requests.git
109
+ ```
110
+
111
+ You can also apply this setting to your global Git config:
112
+
113
+ ```shell
114
+ git config --global fetch.fsck.badTimezone ignore
115
+ ```
116
+
117
+ ---
118
+
119
+ [![Kenneth Reitz](https://raw.githubusercontent.com/psf/requests/main/ext/kr.png)](https://kennethreitz.org) [![Python Software Foundation](https://raw.githubusercontent.com/psf/requests/main/ext/psf.png)](https://www.python.org/psf)
.cache/pip/http-v2/f/6/8/9/5/f689581dbe86ea3c1a14226c252116f97e87dde2c835d7d60fd42b59.body ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: dill
3
+ Version: 0.3.8
4
+ Summary: serialize all of Python
5
+ Home-page: https://github.com/uqfoundation/dill
6
+ Author: Mike McKerns
7
+ Author-email: mmckerns@uqfoundation.org
8
+ Maintainer: Mike McKerns
9
+ Maintainer-email: mmckerns@uqfoundation.org
10
+ License: BSD-3-Clause
11
+ Download-URL: https://pypi.org/project/dill/#files
12
+ Project-URL: Documentation, http://dill.rtfd.io
13
+ Project-URL: Source Code, https://github.com/uqfoundation/dill
14
+ Project-URL: Bug Tracker, https://github.com/uqfoundation/dill/issues
15
+ Platform: Linux
16
+ Platform: Windows
17
+ Platform: Mac
18
+ Classifier: Development Status :: 5 - Production/Stable
19
+ Classifier: Intended Audience :: Developers
20
+ Classifier: Intended Audience :: Science/Research
21
+ Classifier: License :: OSI Approved :: BSD License
22
+ Classifier: Programming Language :: Python :: 3
23
+ Classifier: Programming Language :: Python :: 3.8
24
+ Classifier: Programming Language :: Python :: 3.9
25
+ Classifier: Programming Language :: Python :: 3.10
26
+ Classifier: Programming Language :: Python :: 3.11
27
+ Classifier: Programming Language :: Python :: 3.12
28
+ Classifier: Programming Language :: Python :: Implementation :: CPython
29
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
30
+ Classifier: Topic :: Scientific/Engineering
31
+ Classifier: Topic :: Software Development
32
+ Requires-Python: >=3.8
33
+ Provides-Extra: graph
34
+ Requires-Dist: objgraph (>=1.7.2) ; extra == 'graph'
35
+ Provides-Extra: profile
36
+ Requires-Dist: gprof2dot (>=2022.7.29) ; extra == 'profile'
37
+ Provides-Extra: readline
38
+
39
+ -----------------------------
40
+ dill: serialize all of Python
41
+ -----------------------------
42
+
43
+ About Dill
44
+ ==========
45
+
46
+ ``dill`` extends Python's ``pickle`` module for serializing and de-serializing
47
+ Python objects to the majority of the built-in Python types. Serialization
48
+ is the process of converting an object to a byte stream, and the inverse
49
+ of which is converting a byte stream back to a Python object hierarchy.
50
+
51
+ ``dill`` provides the user the same interface as the ``pickle`` module, and
52
+ also includes some additional features. In addition to pickling Python
53
+ objects, ``dill`` provides the ability to save the state of an interpreter
54
+ session in a single command. Hence, it would be feasible to save an
55
+ interpreter session, close the interpreter, ship the pickled file to
56
+ another computer, open a new interpreter, unpickle the session and
57
+ thus continue from the 'saved' state of the original interpreter
58
+ session.
59
+
60
+ ``dill`` can be used to store Python objects to a file, but the primary
61
+ usage is to send Python objects across the network as a byte stream.
62
+ ``dill`` is quite flexible, and allows arbitrary user defined classes
63
+ and functions to be serialized. Thus ``dill`` is not intended to be
64
+ secure against erroneously or maliciously constructed data. It is
65
+ left to the user to decide whether the data they unpickle is from
66
+ a trustworthy source.
67
+
68
+ ``dill`` is part of ``pathos``, a Python framework for heterogeneous computing.
69
+ ``dill`` is in active development, so any user feedback, bug reports, comments,
70
+ or suggestions are highly appreciated. A list of issues is located at
71
+ https://github.com/uqfoundation/dill/issues, with a legacy list maintained at
72
+ https://uqfoundation.github.io/project/pathos/query.
73
+
74
+
75
+ Major Features
76
+ ==============
77
+
78
+ ``dill`` can pickle the following standard types:
79
+
80
+ - none, type, bool, int, float, complex, bytes, str,
81
+ - tuple, list, dict, file, buffer, builtin,
82
+ - Python classes, namedtuples, dataclasses, metaclasses,
83
+ - instances of classes,
84
+ - set, frozenset, array, functions, exceptions
85
+
86
+ ``dill`` can also pickle more 'exotic' standard types:
87
+
88
+ - functions with yields, nested functions, lambdas,
89
+ - cell, method, unboundmethod, module, code, methodwrapper,
90
+ - methoddescriptor, getsetdescriptor, memberdescriptor, wrapperdescriptor,
91
+ - dictproxy, slice, notimplemented, ellipsis, quit
92
+
93
+ ``dill`` cannot yet pickle these standard types:
94
+
95
+ - frame, generator, traceback
96
+
97
+ ``dill`` also provides the capability to:
98
+
99
+ - save and load Python interpreter sessions
100
+ - save and extract the source code from functions and classes
101
+ - interactively diagnose pickling errors
102
+
103
+
104
+ Current Release
105
+ ===============
106
+
107
+ The latest released version of ``dill`` is available from:
108
+
109
+ https://pypi.org/project/dill
110
+
111
+ ``dill`` is distributed under a 3-clause BSD license.
112
+
113
+
114
+ Development Version
115
+ ===================
116
+
117
+ You can get the latest development version with all the shiny new features at:
118
+
119
+ https://github.com/uqfoundation
120
+
121
+ If you have a new contribution, please submit a pull request.
122
+
123
+
124
+ Installation
125
+ ============
126
+
127
+ ``dill`` can be installed with ``pip``::
128
+
129
+ $ pip install dill
130
+
131
+ To optionally include the ``objgraph`` diagnostic tool in the install::
132
+
133
+ $ pip install dill[graph]
134
+
135
+ To optionally include the ``gprof2dot`` diagnostic tool in the install::
136
+
137
+ $ pip install dill[profile]
138
+
139
+ For windows users, to optionally install session history tools::
140
+
141
+ $ pip install dill[readline]
142
+
143
+
144
+ Requirements
145
+ ============
146
+
147
+ ``dill`` requires:
148
+
149
+ - ``python`` (or ``pypy``), **>=3.8**
150
+ - ``setuptools``, **>=42**
151
+
152
+ Optional requirements:
153
+
154
+ - ``objgraph``, **>=1.7.2**
155
+ - ``gprof2dot``, **>=2022.7.29**
156
+ - ``pyreadline``, **>=1.7.1** (on windows)
157
+
158
+
159
+ Basic Usage
160
+ ===========
161
+
162
+ ``dill`` is a drop-in replacement for ``pickle``. Existing code can be
163
+ updated to allow complete pickling using::
164
+
165
+ >>> import dill as pickle
166
+
167
+ or::
168
+
169
+ >>> from dill import dumps, loads
170
+
171
+ ``dumps`` converts the object to a unique byte string, and ``loads`` performs
172
+ the inverse operation::
173
+
174
+ >>> squared = lambda x: x**2
175
+ >>> loads(dumps(squared))(3)
176
+ 9
177
+
178
+ There are a number of options to control serialization which are provided
179
+ as keyword arguments to several ``dill`` functions:
180
+
181
+ * with *protocol*, the pickle protocol level can be set. This uses the
182
+ same value as the ``pickle`` module, *DEFAULT_PROTOCOL*.
183
+ * with *byref=True*, ``dill`` behaves a lot more like pickle, with
184
+ certain objects (like modules) pickled by reference as opposed to
185
+ attempting to pickle the object itself.
186
+ * with *recurse=True*, objects referred to in the global dictionary are
187
+ recursively traced and pickled, instead of the default behavior of
188
+ attempting to store the entire global dictionary.
189
+ * with *fmode*, the contents of the file can be pickled along with the file
190
+ handle, which is useful if the object is being sent over the wire to a
191
+ remote system which does not have the original file on disk. Options are
192
+ *HANDLE_FMODE* for just the handle, *CONTENTS_FMODE* for the file content
193
+ and *FILE_FMODE* for content and handle.
194
+ * with *ignore=False*, objects reconstructed with types defined in the
195
+ top-level script environment use the existing type in the environment
196
+ rather than a possibly different reconstructed type.
197
+
198
+ The default serialization can also be set globally in *dill.settings*.
199
+ Thus, we can modify how ``dill`` handles references to the global dictionary
200
+ locally or globally::
201
+
202
+ >>> import dill.settings
203
+ >>> dumps(absolute) == dumps(absolute, recurse=True)
204
+ False
205
+ >>> dill.settings['recurse'] = True
206
+ >>> dumps(absolute) == dumps(absolute, recurse=True)
207
+ True
208
+
209
+ ``dill`` also includes source code inspection, as an alternative to pickling::
210
+
211
+ >>> import dill.source
212
+ >>> print(dill.source.getsource(squared))
213
+ squared = lambda x:x**2
214
+
215
+ To aid in debugging pickling issues, use *dill.detect* which provides
216
+ tools like pickle tracing::
217
+
218
+ >>> import dill.detect
219
+ >>> with dill.detect.trace():
220
+ >>> dumps(squared)
221
+ ┬ F1: <function <lambda> at 0x7fe074f8c280>
222
+ ├┬ F2: <function _create_function at 0x7fe074c49c10>
223
+ │└ # F2 [34 B]
224
+ ├┬ Co: <code object <lambda> at 0x7fe07501eb30, file "<stdin>", line 1>
225
+ │├┬ F2: <function _create_code at 0x7fe074c49ca0>
226
+ ││└ # F2 [19 B]
227
+ │└ # Co [87 B]
228
+ ├┬ D1: <dict object at 0x7fe0750d4680>
229
+ │└ # D1 [22 B]
230
+ ├┬ D2: <dict object at 0x7fe074c5a1c0>
231
+ │└ # D2 [2 B]
232
+ ├┬ D2: <dict object at 0x7fe074f903c0>
233
+ │├┬ D2: <dict object at 0x7fe074f8ebc0>
234
+ ││└ # D2 [2 B]
235
+ │└ # D2 [23 B]
236
+ └ # F1 [180 B]
237
+
238
+ With trace, we see how ``dill`` stored the lambda (``F1``) by first storing
239
+ ``_create_function``, the underlying code object (``Co``) and ``_create_code``
240
+ (which is used to handle code objects), then we handle the reference to
241
+ the global dict (``D2``) plus other dictionaries (``D1`` and ``D2``) that
242
+ save the lambda object's state. A ``#`` marks when the object is actually stored.
243
+
244
+
245
+ More Information
246
+ ================
247
+
248
+ Probably the best way to get started is to look at the documentation at
249
+ http://dill.rtfd.io. Also see ``dill.tests`` for a set of scripts that
250
+ demonstrate how ``dill`` can serialize different Python objects. You can
251
+ run the test suite with ``python -m dill.tests``. The contents of any
252
+ pickle file can be examined with ``undill``. As ``dill`` conforms to
253
+ the ``pickle`` interface, the examples and documentation found at
254
+ http://docs.python.org/library/pickle.html also apply to ``dill``
255
+ if one will ``import dill as pickle``. The source code is also generally
256
+ well documented, so further questions may be resolved by inspecting the
257
+ code itself. Please feel free to submit a ticket on github, or ask a
258
+ question on stackoverflow (**@Mike McKerns**).
259
+ If you would like to share how you use ``dill`` in your work, please send
260
+ an email (to **mmckerns at uqfoundation dot org**).
261
+
262
+
263
+ Citation
264
+ ========
265
+
266
+ If you use ``dill`` to do research that leads to publication, we ask that you
267
+ acknowledge use of ``dill`` by citing the following in your publication::
268
+
269
+ M.M. McKerns, L. Strand, T. Sullivan, A. Fang, M.A.G. Aivazis,
270
+ "Building a framework for predictive science", Proceedings of
271
+ the 10th Python in Science Conference, 2011;
272
+ http://arxiv.org/pdf/1202.1056
273
+
274
+ Michael McKerns and Michael Aivazis,
275
+ "pathos: a framework for heterogeneous computing", 2010- ;
276
+ https://uqfoundation.github.io/project/pathos
277
+
278
+ Please see https://uqfoundation.github.io/project/pathos or
279
+ http://arxiv.org/pdf/1202.1056 for further information.
280
+
.local/share/jupyter/nbextensions/code_font_size/code_font_size.js ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Increase/decrease code font size
//
// Jupyter notebook extension that adds two toolbar buttons which grow or
// shrink the font size of code cells by editing the `.CodeMirror pre` rule of
// the user's custom.css style sheet.

define([
    'base/js/namespace',
    'base/js/events'
], function(Jupyter, events) {
    // Font size (in px) assumed when no usable size is found in the rule.
    var DEFAULT_FONT_SIZE = 14;
    // Smallest font size (in px) the decrease button will go down to.
    var MIN_FONT_SIZE = 8;
    // Amount (in px) added or removed on each button press.
    var FONT_SIZE_STEP = 3;

    /**
     * Grow or shrink the code-cell font size by one step.
     *
     * Looks up the `.CodeMirror pre` rule in the loaded custom.css style
     * sheet (creating the rule when it is missing) and rewrites its
     * `font-size` and `padding-bottom`. Does nothing, apart from logging a
     * warning, when custom.css is not among the document's style sheets.
     *
     * @param {boolean} doIncrease - true to increase the size, false to
     *     decrease it.
     */
    var code_change_fontsize = function(doIncrease) {
        var i;
        var pre_css = null;
        var pre_style = null;

        // Find the custom.css style sheet.
        for (i = 0; i < document.styleSheets.length; i++) {
            if (/.*\/custom\/custom\.css/.test(document.styleSheets[i].href)) {
                pre_css = document.styleSheets[i];
                break;
            }
        }

        // Without custom.css there is no sheet to edit; bail out instead of
        // throwing on a null dereference.
        if (pre_css === null) {
            console.warn('code_font_size: custom.css style sheet not found');
            return;
        }

        // Find the existing `.CodeMirror pre` rule, if any.
        for (i = 0; i < pre_css.cssRules.length; i++) {
            if (/\.CodeMirror pre/.test(pre_css.cssRules[i].selectorText)) {
                pre_style = pre_css.cssRules[i].style;
                break;
            }
        }

        if (pre_style === null) {
            // CSS lengths must not be quoted, otherwise the declarations are
            // discarded as invalid.
            pre_css.insertRule(".CodeMirror pre { font-size: 14px; padding-bottom: 0px; }", 0);
            // Keep the rule's style declaration, not the rule object itself,
            // so the property writes below actually change the CSS.
            pre_style = pre_css.cssRules[0].style;
        }

        // Extract the numeric part of the current size; fall back to the
        // default when it is empty or has no digits (e.g. a keyword size).
        var digits = /\d+/.exec(pre_style.fontSize || "");
        var font_size = digits ? +digits[0] : DEFAULT_FONT_SIZE;

        font_size += (doIncrease ? FONT_SIZE_STEP : -FONT_SIZE_STEP);
        font_size = Math.max(font_size, MIN_FONT_SIZE);

        // Add bottom padding equal to the growth beyond the default size.
        var padding_size = Math.max(font_size - DEFAULT_FONT_SIZE, 0);

        pre_style.paddingBottom = padding_size + "px";
        pre_style.fontSize = font_size + "px";
    };

    /**
     * Extension entry point: registers the increase/decrease actions and
     * adds their buttons to the notebook toolbar.
     */
    var load_ipython_extension = function () {
        Jupyter.toolbar.add_buttons_group([
            /*
             * Buttons to increase/decrease code font size
             */
            Jupyter.keyboard_manager.actions.register ({
                'help'   : 'Increase code font size',
                'icon'   : 'fa-search-plus',
                'handler': function () {
                    // Call directly: the handler only fires after the page is
                    // loaded, so no document-ready wrapper is needed.
                    code_change_fontsize(true);
                }
            }, 'increase-code-font-size', 'code_font_size'),
            Jupyter.keyboard_manager.actions.register ({
                'help'   : 'Decrease code font size',
                'icon'   : 'fa-search-minus',
                'handler': function () {
                    code_change_fontsize(false);
                }
            }, 'decrease-code-font-size', 'code_font_size'),

        ]);
    };
    return {
        load_ipython_extension : load_ipython_extension
    };
});
.local/share/jupyter/nbextensions/code_prettify/README_code_prettify.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ A Code Prettifier
2
+ =================
3
+
4
+ This nbextension reformats/prettifies code in notebook code cells.
5
+
6
+ Under the hood, it uses a call to the current notebook kernel to reformat the
7
+ code.
8
+ Thus the actual prettifier package has to be callable from the current kernel
9
+ language.
10
+
11
+ With an appropriately-configured prettifier for the kernel in use, the
12
+ nbextension provides
13
+
14
+ - a toolbar button (configurable to be added or not)
15
+
16
+ - a keyboard shortcut for reformatting the current code-cell (default shortcut
17
+ is `Ctrl-L`, can also be configured not to add the keyboard shortcut).
18
+
19
+ - a keyboard shortcut for reformatting the whole notebook (default shortcut
20
+ is `Ctrl-Shift-L`, can also be configured not to add the keyboard shortcut).
21
+
22
+ The code to be prettified must be syntactically correct. The nbextension may also point out basic syntax errors.
23
+
24
+ ![](demo-py.gif)
25
+ ![](demo-R.gif)
26
+ ![](demo-jv.gif)
27
+
28
+
29
+ Compatible Kernels
30
+ ------------------
31
+
32
+ Example implementations are provided for prettifiers for ipython, ir and
33
+ ijavascript kernels which should work out of the box (assuming availability of
34
+ the relevant kernel-specific [prerequisites] mentioned below), but the
35
+ kernel-specific prettifier calls are configurable, so the model is applicable
36
+ to essentially any kernel language and prettifier library.
37
+
38
+ Other languages may be added as defaults in the future, but given that there
39
+ are more than 50 [kernels] available for Jupyter, it is not easily possible to
40
+ support all of them out of the box, unless people with experience in the
41
+ relevant kernels have the time to contribute code. For information on how the
42
+ reformatting takes place, and how to adapt it for your particular
43
+ kernel/prettifier, see the [options] and [internals] sections below.
44
+ If you implement a language that isn't yet provided by default, please submit a
45
+ PR or let us know to add it to the repo :)
46
+
47
+ Under the hood, this nbextension's functionality is provided by the
48
+ [KerneExecOnCells library], a shared library for creating Jupyter nbextensions
49
+ which transform code cell text using calls to the active kernel.
50
+
51
+
52
+ Prerequisites
53
+ -------------
54
+
55
+ Of course, you must have the necessary kernel-specific packages installed for
56
+ the prettifier call to work:
57
+
58
+ - for the default python implementation, the [yapf] module is required:
59
+
60
+ pip install yapf
61
+
62
+ Others you might consider using include [autopep8] - see [README_autopep8.md].
63
+
64
+ - for R, the default implementation uses the [formatR] and [jsonlite] packages:
65
+
66
+ ```r
67
+ install.packages(c("formatR", "jsonlite"), repos="http://cran.rstudio.com")
68
+ ```
69
+
70
+ - for [ijavascript], the [js-beautify] package is used:
71
+ (*Under linux, in the root of your user tree = ~*)
72
+
73
+ npm install js-beautify
74
+
75
+ Under Windows, you may then need to set the `NODE_PATH` environment variable
76
+ (see [this question on stackoverflow]) to `%AppData%\npm\node_modules`
77
+ (Windows 7/8/10).
78
+ To be done with it once and for all, add this as a System variable in the
79
+ Advanced tab of the System Properties dialog.
80
+
81
+
82
+ Options
83
+ -------
84
+
85
+ All options are provided by the [KerneExecOnCells library] - see the
86
+ [internals] section below for details.
87
+ There are a few nbextension-wide options, configurable using the
88
+ [jupyter_nbextensions_configurator] or by editing the `notebook` section config
89
+ file directly.
90
+ The options are as follows:
91
+
92
+ - `code_prettify.add_toolbar_button`:
93
+ Whether to add a toolbar button to transform the selected cell(s).
94
+ Defaults to `true`.
95
+
96
+ - `code_prettify.button_icon`:
97
+ A font-awesome class defining the icon used for the toolbar button and
98
+ actions. See [fontawesome] for available icon classes.
99
+ Defaults to `fa-legal`.
100
+
101
+ - `code_prettify.button_label`:
102
+ Toolbar button label text. Also used in the actions' help text.
103
+ Defaults to `Code prettify`.
104
+
105
+ - `code_prettify.register_hotkey`:
106
+ Whether to register hotkeys to transform the selected cell(s)/whole notebook.
107
+ Defaults to `true`.
108
+
109
+ - `code_prettify.hotkeys.process_all`:
110
+ Hotkey to use to transform all the code cells in the notebook.
111
+ Defaults to `Ctrl-Shift-L`.
112
+
113
+ - `code_prettify.hotkeys.process_selected`:
114
+ Hotkey to use to transform the selected cell(s).
115
+ Defaults to `Ctrl-L`.
116
+
117
+ - `code_prettify.show_alerts_for_not_supported_kernel`:
118
+ Whether to show alerts if the kernel is not supported.
119
+ Defaults to `false`.
120
+
121
+ - `code_prettify.show_alerts_for_errors`:
122
+ Whether to show alerts for errors in the kernel calls.
123
+ Defaults to `true`.
124
+
125
+ - `code_prettify.kernel_config_map_json`:
126
+ The value of this key is a string which can be parsed into a json object
127
+ giving the config for each kernel language.
128
+
129
+ The following give the per-kernel options of the parsed json, using the
130
+ language key `python`:
131
+
132
+ * `code_prettify.kernel_config_map_json.python.library`:
133
+ String to execute in the kernel in order to load any necessary kernel
134
+ libraries.
135
+
136
+ * `code_prettify.kernel_config_map_json.python.replacements_json_to_kernel`:
137
+ a list of pairs of strings, used as arguments to javascript's
138
+ `String.replace(from, to)` to translate from a json string into a valid
139
+ representation of the same string in the kernel language. Since json
140
+ strings are particularly simple, this can often (as with the python
141
+ language) be left as the default, an empty list.
142
+
143
+ * `code_prettify.kernel_config_map_json.python.prefix` and
144
+ `code_prettify.kernel_config_map_json.python.postfix`:
145
+ Strings added as bookends to the kernel string (translated from the json
146
+ string using the replacements above) to make up the call to the
147
+ kernel's prettifier libraries.
148
+
149
+ * `code_prettify.kernel_config_map_json.python.trim_formatted_text`:
150
+ Whether to trim whitespace from the transformed cell text. Since jupyter
151
+ cells don't usually have leading or trailing whitespace, the default
152
+ behaviour is to trim the transformed text, in order to prevent the
153
+ transform adding extra newlines at the end (a common behaviour for source
154
+ files, where having a trailing newline is often considered good practice).
155
+
156
+
157
+ Internals
158
+ ---------
159
+
160
+ Under the hood, this nbextension uses the [KerneExecOnCells library], a shared
161
+ library for creating Jupyter nbextensions which transform code cell text using
162
+ calls to the active kernel.
163
+
164
+ The model is essentially:
165
+
166
+ 1. The cell text is grabbed by client-side javascript, then turned into a json
167
+ string using javascript `JSON.stringify`. Json-compatible strings are
168
+ a particularly simple string format, which is compatible with many other
169
+ programming languages without much modification (e.g. a valid json string
170
+ is also a valid string in python 3, and also in python 2 when prefixed with
171
+ a `u`), and easily converted for use in others (because of its simplicity).
172
+
173
+ 2. Optional regex replacements are used to translate the json-format string
174
+ into a valid kernel string. Python, R and javascript don't require this
175
+ step, but other languages may do, so it's implemented for flexibility
176
+ using the per-kernel config key `replacements_json_to_kernel`, which is a
177
+ list of pairs of arguments to javascript `String.replace`.
178
+
179
+ 3. The kernel-specific prettifier call is then composed from
180
+ `kernel_config.prefix` + `kernel_text_string` + `kernel_config.postfix` and
181
+ sent to the kernel for execution. This kernel call is expected to get the
182
+ formatted cell text _printed_ as a json-compatible string. Since most
183
+ kernel languages have json packages, this should hopefully be easy to
184
+ arrange. The reason for printing the text rather than simply displaying it,
185
+ is that it prevents us having to translate from a kernel string
186
+ representing a json string.
187
+
188
+ 4. The callback for the kernel execution in client-side javascript parses the
189
+ printed json-format string, optionally trims trailing whitespace according
190
+ to the `trim_formatted_text` key (which defaults to `true`) in the
191
+ per-kernel config, and then sets the cell text using the result.
192
+
193
+ The process is probably best illustrated using an example for the python
194
+ implementation:
195
+
196
+ 1. **At nbextension load**, the `code_prettify.kernel_config_map_json` config
197
+ option is parsed to give the json object
198
+
199
+ ```json
200
+ {
201
+ "python": {
202
+ "library": "import json\nimport yapf.yapflib.yapf_api",
203
+ "prefix": "print(json.dumps(yapf.yapflib.yapf_api.FormatCode(u",
204
+ "postfix": ")[0]))"
205
+ }
206
+ }
207
+ ```
208
+
209
+ (other kernel languages are omitted for clarity).
210
+
211
+ 2. **On kernel becoming ready**, the nbextension looks up the config for the
212
+ kernel's language (in our example, this is the `python` key of the kernel
213
+ config json object above). It then sends the kernel config's `library`
214
+ string to the kernel for execution. Thus the python implementation above
215
+ executes
216
+
217
+ ```python
218
+ import json
219
+ import yapf.yapflib.yapf_api
220
+ ```
221
+
222
+ 3. **On requesting a cell be prettified** which can happen by clicking the
223
+ toolbar, or with a (configurable) hotkey, the following happens:
224
+
225
+ Say the cell to be formatted contains the following ugly python code:
226
+
227
+ ```python
228
+ msg= 'hello '+"world"
229
+ print (
230
+ msg )
231
+ ```
232
+
233
+ Then the result of the `JSON.stringify` call will be a string containing
234
+
235
+ ```json
236
+ "msg= 'hello '+\"world\"\nprint (\n msg )"
237
+ ```
238
+
239
+ (note the opening and closing quotes). Concatenating this with the prefix &
240
+ postfix strings from the python kernel config above, gives us the kernel
241
+ code to execute. The call sent to the python kernel is therefore
242
+
243
+ ```python
244
+ print(json.dumps(yapf.yapflib.yapf_api.FormatCode(u"msg= 'hello '+\"world\"\nprint (\n msg )")[0]))
245
+ ```
246
+
247
+ 4. What gets 'printed' by the kernel (i.e. returned to the javascript stream
248
+ callback) is the following json-format string:
249
+
250
+ ```json
251
+ "msg = 'hello ' + \"world\"\nprint(msg)\n"
252
+ ```
253
+
254
+ The default is to trim whitespace from the returned prettified text, which
255
+ results in the final prettified python code for the cell:
256
+
257
+ ```python
258
+ msg = 'hello ' + "world"
259
+ print(msg)
260
+ ```
261
+
262
+
263
+ History
264
+ -------
265
+
266
+ - [@jfbercher], august 14, 2016, first version, named `yapf_ext`
267
+ - [@jfbercher], august 19, 2016, second version `code_prettify`
268
+ - introduced support for R and javascript.
269
+ - changed extension name from `yapf_ext` to `code_prettify`
270
+ - [@jcb91], december 2016
271
+ - made addition of toolbar button & hotkey configurable
272
+ - reworked to avoid regex replacements for conversion to/from kernel string
273
+ formats, in favour of json-string interchange
274
+ - made kernel-specific prettifier calls configurable, allowing support for
275
+ different prettifiers & arbitrary kernels
276
+ - improved documentation
277
+ - [@jfbercher], december 2016-january 2017
278
+ - added a configurable shortkey to reflow the whole notebook
279
+ - extracted most of the code to build a general library of functions,
280
+ `kernel_exec_on_cell.js`, which can be used for all nbextensions which
281
+ need to exec some code (via the current kernel) on the text from cells.
282
+
283
+
284
+ [@jcb91]: https://github.com/jcb91
285
+ [@jfbercher]: https://github.com/jfbercher
286
+ [autopep8]: https://github.com/hhatto/autopep8
287
+ [formatR]: https://yihui.name/formatr
288
+ [fontawesome]: https://fontawesome.com/icons
289
+ [ijavascript]: https://n-riesco.github.io/ijavascript
290
+ [internals]: #Internals
291
+ [js-beautify]: https://github.com/beautify-web/js-beautify
292
+ [jsonlite]: https://github.com/jeroen/jsonlite
293
+ [jupyter_nbextensions_configurator]: https://github.com/Jupyter-contrib/jupyter_nbextensions_configurator
294
+ [KerneExecOnCells library]: README.md
295
+ [kernels]: https://github.com/ipython/ipython/wiki/IPython-kernels-for-other-languages
296
+ [options]: #Options
297
+ [prerequisites]: #Prerequisites
298
+ [README_autopep8.md]: README_autopep8.md
299
+ [this question on stackoverflow]: https://stackoverflow.com/questions/9587665/nodejs-cannot-find-installed-module-on-windows
300
+ [yapf]: https://github.com/google/yapf
.local/share/jupyter/nbextensions/code_prettify/demo_2to3.gif ADDED
.local/share/jupyter/nbextensions/code_prettify/isort.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Name: isort formatter
3
+ Description: Sort imports in python files using isort
4
+ Link: README_isort.md
5
+ Main: isort.js
6
+ Compatibility: Jupyter 4.x, 5.x
7
+ Parameters:
8
+
9
+ - name: isort.add_toolbar_button
10
+ description: Add a toolbar button to convert the selected cell(s)
11
+ input_type: checkbox
12
+ default: true
13
+
14
+ - name: isort.button_icon
15
+ description: |
16
+ Toolbar button icon: a font-awesome class defining the icon used for the
17
+ toolbar button. See https://fontawesome.com/icons for available icons.
18
+ input_type: text
19
+ default: 'fa-sort'
20
+
21
+ - name: isort.button_label
22
+ description: Toolbar button label text
23
+ input_type: text
24
+ default: 'Sort imports with isort'
25
+
26
+ - name: isort.kernel_config_map_json
27
+ description: |
28
+ kernel_config_map_json:
29
+ json defining library calls required to load the kernel-specific
30
+ converting modules, and the prefix & postfix for the json-format string
31
+ required to make the converting call.
32
+ input_type: textarea
33
+ default: |
34
+ {
35
+ "python": {
36
+ "library": "import json, isort\ndef _isort_refactor_cell(src):\n try:\n tree = isort.SortImports(file_contents=src).output\n except Exception:\n return src \n else:\n return str(tree)[:-1]",
37
+ "prefix": "print(json.dumps(_isort_refactor_cell(u",
38
+ "postfix": ")))"
39
+ }
40
+ }
.local/share/jupyter/nbextensions/codefolding/codefolding.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Name: Codefolding
3
+ Description: This extension enables the CodeMirror feature to allow codefolding in code cells
4
+ Link: readme.md
5
+ Icon: icon.png
6
+ Main: main.js
7
+ Compatibility: 4.x, 5.x
8
+ Parameters:
9
+ - name: codefolding_hotkey
10
+ description: Hotkey to fold/unfold code
11
+ input_type: hotkey
12
+ default: Alt-F
13
+ - name: init_delay
14
+ description: Add a delay before initializing the extension. Useful when the gutter is not being initialized correctly.
15
+ input_type: number
16
+ min: 0
17
+ default: 0
.local/share/jupyter/nbextensions/codefolding/codefolding_editor.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Name: Codefolding in Editor
3
+ Description: |
4
+ Enables the CodeMirror feature to allow codefolding in the Jupyter file
5
+ editor view.
6
+ Note that this also uses the codefolding hotkey from the codefolding
7
+ nbextension for the notebook view.
8
+ Link: readme.md
9
+ Icon: codefolding_editor.png
10
+ Main: edit.js
11
+ Compatibility: 4.x, 5.x
12
+ Parameters:
13
+ - name: init_delay
14
+ description: Add a delay before initializing the extension. Useful when the gutter is not being initialized correctly.
15
+ input_type: number
16
+ min: 0
17
+ default: 1000
18
+ Section: edit
.local/share/jupyter/nbextensions/codefolding/codefolding_indent_folded_2.png ADDED
.local/share/jupyter/nbextensions/codefolding/firstline-fold.js ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Fold helper "firstline": allow folding of the complete cell when the first
 * line of the editor starts with the current mode's line-comment token.
 *
 * @param {Object} cm - CodeMirror editor instance the fold is requested on
 * @param {Object} start - position of the fold request; only `start.line` is read
 * @returns {{from: Object, to: Object}|undefined} range running from the first
 *     line to the last line of the editor, or undefined when nothing folds here
 */
CodeMirror.registerHelper("fold", "firstline", function(cm, start) {
    // Only a request on the very first line can fold the whole cell.
    if (start.line !== 0) {
        return;
    }

    var token = cm.getMode().lineComment;
    // Some modes define no line-comment token. Without this guard,
    // lastIndexOf(undefined, 0) would look for the literal text "undefined".
    if (typeof token !== "string") {
        return;
    }

    var lineText = cm.getLine(start.line);
    // lastIndexOf(token, 0) === 0 is a "starts with token" test.
    if (lineText.lastIndexOf(token, 0) !== 0) {
        return;
    }

    // Declared locally: the original assigned to an undeclared `end`, which
    // created an implicit global (and would throw in strict mode).
    var end = cm.lastLine();
    return {
        from: CodeMirror.Pos(start.line, null),
        to: CodeMirror.Pos(end, null)
    };
});
.local/share/jupyter/nbextensions/codefolding/magic-fold.js ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Fold helper "magic": allow folding of the complete cell when the first line
 * of the editor starts with the IPython cell-magic marker "%%".
 *
 * @param {Object} cm - CodeMirror editor instance the fold is requested on
 * @param {Object} start - position of the fold request; only `start.line` is read
 * @returns {{from: Object, to: Object}|undefined} range running from the first
 *     line to the last line of the editor, or undefined when nothing folds here
 */
CodeMirror.registerHelper("fold", "magic", function(cm, start) {
    var token = "%%";

    // Only a request on the very first line can fold the whole cell.
    if (start.line !== 0) {
        return;
    }

    var lineText = cm.getLine(start.line);
    // lastIndexOf(token, 0) === 0 is a "starts with token" test.
    if (lineText.lastIndexOf(token, 0) !== 0) {
        return;
    }

    // Declared locally: the original assigned to an undeclared `end`, which
    // created an implicit global (and would throw in strict mode).
    var end = cm.lastLine();
    return {
        from: CodeMirror.Pos(start.line, null),
        to: CodeMirror.Pos(end, null)
    };
});
.local/share/jupyter/nbextensions/codefolding/magic-unfolded.png ADDED
.local/share/jupyter/nbextensions/codemirror_mode_extensions/main.js ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
// Extends CodeMirror's 'octave' mode with a line-comment token and a fold type.
define(['codemirror/lib/codemirror'], function (CodeMirror) {
    "use strict";

    // Properties merged into the 'octave' mode when the extension is loaded.
    var octave_extras = {
        lineComment: '%',
        fold: 'indent',
    };

    // nbextension entry point: apply the extra properties to the octave mode.
    function load_ipython_extension() {
        CodeMirror.extendMode('octave', octave_extras);
    }

    return {
        load_ipython_extension : load_ipython_extension
    };
});
.local/share/jupyter/nbextensions/collapsible_headings/main.css ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .collapsible_headings_toggle .h1 {
2
+ font-size: 185.7%;
3
+ margin: 0.538em 0 0 0;
4
+ line-height: 1.0;
5
+ }
6
+ .collapsible_headings_toggle .h2 {
7
+ font-size: 157.1%;
8
+ margin: 0.636em 0 0 0;
9
+ line-height: 1.0;
10
+ }
11
+ .collapsible_headings_toggle .h3 {
12
+ font-size: 128.6%;
13
+ margin: 0.777em 0 0 0;
14
+ line-height: 1.0;
15
+ }
16
+ .collapsible_headings_toggle .h4,
17
+ .collapsible_headings_toggle .h5,
18
+ .collapsible_headings_toggle .h6 {
19
+ font-size: 100%;
20
+ margin: 1em 0 0 0;
21
+ line-height: 1.0;
22
+ }
23
+
24
+ .collapsible_headings_toggle.btn .h1,
25
+ .collapsible_headings_toggle.btn .h2,
26
+ .collapsible_headings_toggle.btn .h3,
27
+ .collapsible_headings_toggle.btn .h4,
28
+ .collapsible_headings_toggle.btn .h5,
29
+ .collapsible_headings_toggle.btn .h6 {
30
+ margin-top: 0;
31
+ }
32
+
33
+ .collapsible_headings_toggle .fa {
34
+ transition: transform 400ms;
35
+
36
+ /* don't support IE filter, since can't rotate 360 */
37
+ -webkit-transform: rotate(360deg);
38
+ -moz-transform: rotate(360deg);
39
+ -ms-transform: rotate(360deg);
40
+ -o-transform: rotate(360deg);
41
+ transform: rotate(360deg);
42
+ }
43
+
44
+ .collapsible_headings_collapsed .fa {
45
+ -webkit-transform: none;
46
+ -moz-transform: none;
47
+ -ms-transform: none;
48
+ -o-transform: none;
49
+ transform: none;
50
+ }
51
+
52
+ /* bracket rules */
53
+
54
+ div.cell {
55
+ position: relative;
56
+ }
57
+
58
+ .chb {
59
+ position: absolute;
60
+ top: -1px;
61
+ bottom: -1px;
62
+ left: calc(100% + 3px);
63
+ display: flex;
64
+ flex-direction: row-reverse;
65
+ justify-content: flex-start;
66
+ align-items: stretch;
67
+ }
68
+
69
+ .chb div {
70
+ margin-left: 2px;
71
+ width: 5px;
72
+ border-color: #aaa;
73
+ border-left-color: transparent;
74
+ border-style: solid;
75
+ border-width: 0 2px 0 2px;
76
+ }
77
+
78
+ .collapsible_headings_collapsed .chb .chb-start {
79
+ border-width: 5px 2px 2px 4px;
80
+ }
81
+
82
+ .chb div:hover,
83
+ .chb .chb-hover,
84
+ .jupyter-soft-selected .chb div{
85
+ border-color: #42A5F5;
86
+ border-left-color: transparent;
87
+ border-width: 0 3px 0 0;
88
+ }
89
+
90
+ .chb .chb-start {
91
+ border-top-width: 1px;
92
+ margin-top: 2px;
93
+ }
94
+
95
+ .chb .chb-end {
96
+ border-bottom-width: 1px;
97
+ margin-bottom: 2px;
98
+ }
99
+
100
+ .chb-start div:hover, .chb .chb-start.chb-hover, .jupyter-soft-selected .chb .chb-start {
101
+ border-top-width: 2px;
102
+ }
103
+
104
+ .chb-end div:hover, .chb .chb-end.chb-hover, .jupyter-soft-selected .chb .chb-end {
105
+ border-bottom-width: 2px;
106
+ }
107
+
108
+ /* ellipsis rules */
109
+ .collapsible_headings_ellipsis .rendered_html h1,
110
+ .collapsible_headings_ellipsis .rendered_html h2,
111
+ .collapsible_headings_ellipsis .rendered_html h3,
112
+ .collapsible_headings_ellipsis .rendered_html h4,
113
+ .collapsible_headings_ellipsis .rendered_html h5,
114
+ .collapsible_headings_ellipsis .rendered_html h6 {
115
+ position: relative;
116
+ padding-right: 2em;
117
+ }
118
+
119
+ .collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h1:after,
120
+ .collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h2:after,
121
+ .collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h3:after,
122
+ .collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h4:after,
123
+ .collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h5:after,
124
+ .collapsible_headings_collapsed.collapsible_headings_ellipsis .rendered_html h6:after {
125
+ position: absolute;
126
+ right: 0;
127
+ bottom: 0;
128
+ content: "[\002026]";
129
+ color: #aaa;
130
+ }
.local/share/jupyter/nbextensions/collapsible_headings/screenshot.png ADDED
.local/share/jupyter/nbextensions/comment-uncomment/main.js ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Registers a configurable edit-mode hotkey that toggles comments in the
// selected cell's editor.

define([
    'base/js/namespace',
], function(
    IPython
) {
    "use strict";

    // Default parameter values; update_params() replaces them with any
    // same-named keys present in the notebook config.
    var params = {
        comment_uncomment_keybinding : 'alt-c',
        comment_uncomment_indent: false,
    };

    // Overwrite each default in `params` with the value from the notebook
    // config, for every key the config actually defines.
    function update_params() {
        var config = IPython.notebook.config;
        Object.keys(params).forEach(function (key) {
            if (config.data.hasOwnProperty(key)) {
                params[key] = config.data[key];
            }
        });
    }

    // Action handler: toggle comments in the selected cell's CodeMirror
    // editor, passing the configured indent option through.
    function toggle_comment() {
        var editor = IPython.notebook.get_selected_cell().code_mirror;
        var options = { indent: params.comment_uncomment_indent };
        editor.toggleComment(options);
        return false;
    }

    // Runs once the config has loaded: refresh the params, register the
    // action, then bind it to the configured key in edit mode.
    function initialize() {
        update_params();

        var action_full_name = IPython.keyboard_manager.actions.register(
            {
                icon: 'fa-comment-o',
                help : 'Toggle comments',
                help_index : 'eb',
                id : 'read_only_codecell',
                handler : toggle_comment
            },
            'toggle-comment',
            'auto'
        );

        var edit_mode_shortcuts = {};
        edit_mode_shortcuts[params.comment_uncomment_keybinding] = action_full_name;
        IPython.notebook.keyboard_manager.edit_shortcuts.add_shortcuts(edit_mode_shortcuts);
    }

    // nbextension entry point: wait for the notebook config before initializing.
    function load_ipython_extension() {
        return IPython.notebook.config.loaded.then(initialize);
    }

    return {
        load_ipython_extension : load_ipython_extension
    };
});
.local/share/jupyter/nbextensions/datestamper/readme.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Datestamper
2
+ ===========
3
+
4
+ Adds a toolbar button which pastes the current time & date into the current cell:
5
+
6
+ ![](icon.png)
.local/share/jupyter/nbextensions/equation-numbering/button.png ADDED
.local/share/jupyter/nbextensions/execute_time/execution-timings-menu.png ADDED
.local/share/jupyter/nbextensions/exercise/exercise.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Name: Exercise
3
+ Description: |
4
+ Define a group of cells as an "exercise".
5
+ The first cell is the question,
6
+ while the rest of the group form the answer or solution.
7
+ The solution can be hidden/shown by clicking on a widget added to the
8
+ question cell.
9
+ Link: readme.md
10
+ Icon: icon.png
11
+ Main: main.js
12
+ Compatibility: 4.x, 5.x
13
+ Parameters:
14
+
15
+ - name: add_button
16
+ description: Add a toolbar button to create/remove an exercise
17
+ input_type: checkbox
18
+ default: true
19
+
20
+ - name: use_hotkey
21
+ description: Add a keyboard shortcut to create/remove an exercise
22
+ input_type: checkbox
23
+ default: true
24
+
25
+ - name: hotkey
26
+ description: Keyboard shortcut optionally used to create/remove an exercise
27
+ input_type: hotkey
28
+ default: 'Alt-D'
.local/share/jupyter/nbextensions/exercise/main.css ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .highlight-mask
2
+ {
3
+ background: transparent url('../images/theme/transBlack75.png') repeat 0 0;
4
+ display: none;
5
+ position: absolute;
6
+ }
7
+ .highlight-drag
8
+ {
9
+ background-color: transparent;
10
+ border: dashed #ff3333 3px;
11
+ position: absolute;
12
+ display: none;
13
+ }
.local/share/jupyter/nbextensions/exercise/readme.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Exercise
2
+ ========
3
+
4
+ These are two extensions for Jupyter, for hiding/showing solutions cells.
5
+ They use the same approach and codebase and differ only by the type of
6
+ `cell widget` used to show/hide the solutions. The two extensions can be used
7
+ simultaneously. They require the `rubberband` extension to be installed and
8
+ enabled.
9
+
10
+ The example below demonstrates some of the features of the exercise extensions.
11
+
12
+ - First, a solution or "details" cell is created by (a) selecting two cells with the rubberband and (b) clicking on the menu-button [exercise extension]
13
+ - Second, the two next cells are selected using a keyboard shortcut, and a solution is created using the shortcut Alt-D [exercise2 extension]
14
+ - Third, the two solutions are expanded by clicking on the corresponding widgets
15
+ - Fourth, the solutions are removed by selecting them and clicking on the buttons in the toolbar.
16
+
17
+ ![](image.gif)
18
+
19
+
20
+ The extensions provide
21
+ ----------------------
22
+
23
+ - a menubar button
24
+ - a cell widget -- A plus/minus button in `exercise` and a sliding checkbox in `exercise2`.
25
+
26
+ The menubar button is devoted to the creation or removal of the solution. The solution consists of several consecutive cells that can be selected by the usual notebook multicell selection methods (e.g. *Shift-down* (select next) or *Shift-up* (select previous) keyboard shortcuts, or using the rubberband extension).
27
+
28
+
29
+ ### Creating a solution
30
+
31
+ Several cells being selected, pressing the menubar button adds a `cell widget` and hides the cells except the first one which serves as a heading cell. *Do not forget to keep the Shift key pressed down while clicking on the menu button
32
+ (otherwise selected cells will be lost)*. It is also possible to use a keyboard shortcut for creating the solution from selected cells: Alt-S for exercise extension and Alt-D for exercise2.
33
+
34
+
35
+ ### Removing a solution
36
+
37
+ If a solution heading (first) cell is selected, then clicking the menu bar button removes this solution and its solutions cells are shown. Using the keyboard shortcut has the same effect.
38
+
39
+
40
+ ### Showing/hiding solution
41
+
42
+ At creation of the solution, the solution cells are hidden. Clicking the `cell widget` toggles the hidden/shown state of the solution.
43
+
44
+
45
+ ### Persistence
46
+
47
+ The state of solutions, hidden or shown, is preserved and automatically restored at startup and on reload.
48
+
49
+
50
+ ### Internals
51
+
52
+ exercise and exercise2 respectively add a solution and solution2 metadata entry to solution cells, whose value is the current hidden/shown state of the solution. For exercise, a div with the plus/minus character is prepended to the solution heading cell. For exercise2, a flex-wrap style is added to the solution heading cell and a checkbox widget, with some css styling, is appended to the cell. A solution[.2]_first metadata entry is also added to enable easy detection of the first cell in an "exercise", which in turn allows several consecutive exercises.
.local/share/jupyter/nbextensions/help_panel/help_panel_ext_fullscreen.png ADDED
.local/share/jupyter/nbextensions/help_panel/readme.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Help Panel
2
+ ===========
3
+
4
+ Installing the extension adds a new button to the toolbar:
5
+
6
+ ![](icon.png)
7
+
8
+ On clicking the button, the notebook width is reduced and a side panel is displayed showing help.
9
+ The contents of the help panel are exactly the same as when going to `Keyboard Shortcuts` in the `Help` menu.
10
+
11
+ ![](help_panel_ext.png)
12
+
13
+ You can drag the sidebar divider to resize it, or click the expand icon at the top left of the bar to get the help panel to expand to fill the screen:
14
+
15
+ ![](help_panel_ext_fullscreen.png)
.local/share/jupyter/nbextensions/hide_input/hide-input.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Type: IPython Notebook Extension
2
+ Compatibility: 3.x, 4.x, 5.x
3
+ Main: main.js
4
+ Name: Hide input
5
+ Icon: icon.png
6
+ Description: "toggle display of selected code cell's input"
7
+ Link: readme.md
.local/share/jupyter/nbextensions/hide_input/main.js ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Toolbar button that hides/shows the input area of the selected cell; the
// state is kept in the cell metadata key `hide_input`.

define([
    'jquery',
    'base/js/namespace',
    'base/js/events'
], function(
    $,
    Jupyter,
    events
) {
    "use strict";

    // Flip the visibility of the selected cell's input div (animated) and
    // record the new state in the cell's metadata.
    function toggle_selected_input() {
        var selected = Jupyter.notebook.get_selected_cell();
        selected.element.find("div.input").toggle('slow');
        selected.metadata.hide_input = ! selected.metadata.hide_input;
    }

    // Hide the input div of a single cell if its metadata marks it as hidden.
    function hide_if_flagged(cell) {
        if (cell.metadata.hide_input) {
            cell.element.find("div.input").hide();
        }
    }

    // Apply the stored hide_input state to every cell in the notebook.
    function update_input_visibility() {
        Jupyter.notebook.get_cells().forEach(hide_if_flagged);
    }

    // Toolbar action handler: toggle the input, then drop focus from the
    // button after a short delay.
    function on_button_click() {
        toggle_selected_input();
        setTimeout(function() { $('#btn-hide-input').blur(); }, 500);
    }

    // nbextension entry point.
    function load_ipython_extension() {
        // Register the action and add its button to the toolbar
        var action_name = Jupyter.keyboard_manager.actions.register({
            help   : 'Toggle selected cell input display',
            icon   : 'fa-chevron-up',
            handler: on_button_click
        }, 'toggle-cell-input-display', 'hide_input');
        var button_group = Jupyter.toolbar.add_buttons_group([action_name]);
        $(button_group).find('.btn').attr('id', 'btn-hide-input');

        // Collapse all cells that are marked as hidden
        if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
            // notebook already loaded. Update directly
            update_input_visibility();
        }
        events.on("notebook_loaded.Notebook", update_input_visibility);
    }

    return {
        load_ipython_extension : load_ipython_extension
    };
});
.local/share/jupyter/nbextensions/hide_input_all/hide_input_all_hide.png ADDED
.local/share/jupyter/nbextensions/hide_input_all/icon.png ADDED
.local/share/jupyter/nbextensions/hide_input_all/main.js ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Toolbar button that shows/hides the inputs of all code cells; the state is
// kept in the notebook metadata key `hide_input`.

define([
    'jquery',
    'base/js/namespace',
    'base/js/events'
], function(
    $,
    Jupyter,
    events
) {
    "use strict";

    // Show (show === true) or hide all cell inputs, store the state in the
    // notebook metadata, and bring the toolbar button's active class, icon
    // and title in line with it.
    function set_input_visible(show) {
        Jupyter.notebook.metadata.hide_input = !show;

        var inputs = $('div.input');
        if (show) {
            inputs.show('slow');
        } else {
            inputs.hide('slow');
        }

        var btn = $('#toggle_codecells');
        btn.toggleClass('active', !show);

        var icon = btn.find('i');
        icon.toggleClass('fa-eye', show);
        icon.toggleClass('fa-eye-slash', !show);

        var verb = show ? 'Hide' : 'Show';
        $('#toggle_codecells').attr('title', verb + ' codecell inputs');
    }

    // The button carries the 'active' class while inputs are hidden, so
    // passing that flag as `show` inverts the current state.
    function toggle() {
        var currently_hidden = $('#toggle_codecells').hasClass('active');
        set_input_visible(currently_hidden);
    }

    // Restore the visibility recorded in the notebook metadata.
    function initialize () {
        var hidden = Jupyter.notebook.metadata.hide_input === true;
        set_input_visible(!hidden);
    }

    // Toolbar action handler: toggle, then drop focus from the button after
    // a short delay.
    function on_button_click() {
        toggle();
        setTimeout(function() { $('#toggle_codecells').blur(); }, 500);
    }

    // nbextension entry point.
    function load_ipython_extension() {
        var action_name = Jupyter.keyboard_manager.actions.register({
            help   : 'Hide codecell inputs',
            icon   : 'fa-eye',
            handler: on_button_click
        }, 'hide-codecell-inputs', 'hide_input_all');
        var button_group = Jupyter.toolbar.add_buttons_group([action_name]);
        $(button_group).find('.btn').attr('id', 'toggle_codecells');

        if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
            // notebook_loaded.Notebook event has already happened
            initialize();
        }
        events.on('notebook_loaded.Notebook', initialize);
    }

    return {
        load_ipython_extension : load_ipython_extension
    };
});
.local/share/jupyter/nbextensions/highlight_selected_word/README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Highlight selected word
2
+ =======================
3
+
4
+ [![Join the chat at https://gitter.im/jcb91/jupyter_highlight_selected_word](https://badges.gitter.im/jcb91/jupyter_highlight_selected_word.svg)](https://gitter.im/jcb91/jupyter_highlight_selected_word?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
5
+ [![GitHub issues](https://img.shields.io/github/issues/jcb91/jupyter_highlight_selected_word.svg?maxAge=3600)](https://github.com/jcb91/jupyter_highlight_selected_word/issues)
6
+
7
+
8
+ This nbextension highlights all instances of the selected word in either the
9
+ current cell's editor, or in all cells in the notebook.
10
+ It is based on the CodeMirror addon
11
+ [Match Highlighter](https://codemirror.net/demo/matchhighlighter.html),
12
+ but now uses its own codebase in order to permit matching across multiple
13
+ editors.
14
+
15
+ There are a few configurable [options](#Options), all of which sit under the
16
+ config key `highlight_selected_word` in the `notebook` config section.
17
+
18
+
19
+ Options
20
+ -------
21
+
22
+ Options are stored in the notebook section of the nbconfig.
23
+ The easiest way to configure these is using the
24
+ [jupyter_nbextensions_configurator](https://github.com/Jupyter-contrib/jupyter_nbextensions_configurator)
25
+ serverextension, but you can also configure them directly with a few lines of
26
+ python.
27
+
28
+ The available options are:
29
+
30
+ * `highlight_selected_word.highlight_across_all_cells` - if `true` (default),
31
+ highlight matches across all cells. If `false`, only matches within the
32
+ currently selected cell will be highlighted.
33
+
34
+ * `highlight_selected_word.code_cells_only` - Only apply highlights to editors
35
+ for Code cells, not, for example, Markdown or Raw cells
36
+
37
+ * `highlight_selected_word.highlight_color` - Color used to highlight matching
38
+ words in the focused (active) cell
39
+
40
+ * `highlight_selected_word.highlight_color_blurred` - Color used to highlight
41
+ matching words in blurred (non-active) cells
42
+
43
+ * `highlight_selected_word.outlines_only` - Highlight words using just an
44
+ outline, rather than the background color. In contrast to the default
45
+ background-color highlight, the outline-only is also applied to the
46
+ currently-selected word
47
+
48
+ * `highlight_selected_word.outline_width` - Width, in pixels, of the outline
49
+ used to highlight words when the outline-only setting (above) is selected.
50
+ Defaults to 1.
51
+
52
+ * `highlight_selected_word.delay` - Wait time (in milliseconds) before
53
+ highlighting the matches
54
+
55
+ * `highlight_selected_word.words_only` - If true, only highlight matches if the
56
+ selected text is a word
57
+
58
+ * `highlight_selected_word.highlight_only_whole_words` - Only highlight matches
59
+ which are surrounded by non-word characters. This will use the token
60
+ `highlight_selected_word.show_token` to identify word characters, if it's
61
+ set, otherwise the regular expression `[\w$]` will be used.
62
+
63
+ * `highlight_selected_word.show_token` - Token (regex) to identify word
64
+ characters, used to determine what to highlight when nothing is selected.
65
+ If blank, nothing is highlighted when nothing is selected.
66
+ This regex is also used to determine word boundaries for
67
+ `highlight_selected_word.highlight_only_whole_words`.
68
+
69
+ * `highlight_selected_word.min_chars` - Minimum number of characters that must
70
+ be selected for the highlighting behavior to occur
71
+
72
+ * `highlight_selected_word.use_toggle_hotkey` - Bind the
73
+ `highlight_selected_word.toggle` action to a hotkey. Defaults to `false`.
74
+
75
+ * `highlight_selected_word.toggle_hotkey` - Which hotkey to bind to the
76
+ `highlight_selected_word.toggle` action (if set to use, see item above).
77
+ Defaults to `alt-h`
78
+
79
+ * `highlight_selected_word.only_cells_in_scroll` - Only apply highlights to
80
+ editors which are visible in the scrolled view. This may offer performance
81
+ benefits for larger notebooks, but may be annoying if you're doing a lot of
82
+ scrolling :/
83
+
84
+ * `highlight_selected_word.scroll_min_delay` - Minimum delay in ms between
85
+ updating highlights on scrolling the notebook (used only if
86
+ `highlight_selected_word.only_cells_in_scroll` is `true`).
87
+ If set to zero, no update is done on scroll.
88
+
89
+ * `highlight_selected_word.hide_selections_in_unfocussed` - Hide any text
90
+ selection in non-focussed cells. Otherwise, each cell can show a text
91
+ selection even when its editor is not focussed, which can be confused with
92
+ match highlights.
93
+
94
+ For example, to set the delay to half a second, and limit highlighting to code
95
+ cells, we can use the following python snippet:
96
+
97
+ ```python
98
+ from notebook.services.config import ConfigManager
99
+ cm = ConfigManager()
100
+ cm.update('notebook', {'highlight_selected_word': {
101
+ 'delay': 500,
102
+ 'code_cells_only': True,
103
+ }})
104
+ ```
105
+
106
+
107
+ Feedback
108
+ --------
109
+
110
+ If you have any feedback, or have any problems, please let me know by
111
+ [opening an issue](https://github.com/jcb91/jupyter_highlight_selected_word/issues/new)
112
+ at the project's
113
+ [github repository](https://github.com/jcb91/jupyter_highlight_selected_word).
114
+
115
+ Thanks!
116
+
117
+ Josh.
.local/share/jupyter/nbextensions/highlight_selected_word/configurator.yaml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Compatibility: 4.x, 5.x
3
+ Name: Highlight selected word
4
+ Main: main.js
5
+ Description: Enables the CodeMirror addon "Match Highlighter"
6
+ Link: README.md
7
+ Parameters:
8
+
9
+ - name: highlight_selected_word.enable_on_load
10
+ input_type: checkbox
11
+ default: true
12
+ description: |
13
+ Enable highlighting on loading the notebook interface.
14
+ The highlighting can also be toggled from the view menu
15
+
16
+ - name: highlight_selected_word.highlight_across_all_cells
17
+ input_type: checkbox
18
+ default: true
19
+ description: |
20
+ Highlight matches across all cells. If false, only matches within the
21
+ currently selected cell will be highlighted.
22
+
23
+ - name: highlight_selected_word.code_cells_only
24
+ input_type: checkbox
25
+ default: false
26
+ description: |
27
+ Only apply highlights to editors for Code cells, not, for example, Markdown
28
+ or Raw cells
29
+
30
+ - name: highlight_selected_word.highlight_color
31
+ input_type: color
32
+ default: '#90EE90'
33
+ description: Color used to highlight matching words in the focussed cell
34
+
35
+ - name: highlight_selected_word.highlight_color_blurred
36
+ input_type: color
37
+ default: '#BBFFBB'
38
+ description: Color used to highlight matching words in blurred (non-active) cells
39
+
40
+ - name: highlight_selected_word.outlines_only
41
+ input_type: checkbox
42
+ default: false
43
+ description: |
44
+ Highlight words using just an outline, rather than the background color
45
+
46
+ - name: highlight_selected_word.outline_width
47
+ input_type: number
48
+ default: 1
49
+ min: 0.5
50
+ step: 0.5
51
+ description: |
52
+ Width, in pixels, of the outline used to highlight words when the
53
+ outline-only setting is selected.
54
+
55
+ - name: highlight_selected_word.delay
56
+ input_type: number
57
+ default: 100
58
+ min: 0
59
+ step: 1
60
+ description: 'Wait time, in milliseconds, before highlighting the matches'
61
+
62
+ - name: highlight_selected_word.words_only
63
+ input_type: checkbox
64
+ default: false
65
+ description: Only highlight matches if the selected text is a whole word
66
+
67
+ - name: highlight_selected_word.highlight_only_whole_words
68
+ input_type: checkbox
69
+ default: true
70
+ description: |
71
+ Only highlight matches when they are surrounded by non-word characters, as
72
+ determined by the token below (if set), or the default regex '[\w$]'.
73
+
74
+ - name: highlight_selected_word.show_token
75
+ input_type: text
76
+ default: '[\w$]' # single-quote strings in yaml are like python raw strings
77
+ description: |
78
+ Token (regex) to identify word characters, used to determine what to
79
+ highlight when nothing is selected. If blank, nothing is highlighted when
80
+ nothing is selected.
81
+
82
+ - name: highlight_selected_word.min_chars
83
+ input_type: number
84
+ default: 2
85
+ min: 0
86
+ step: 1
87
+ description: |
88
+ Minimum number of characters that must be selected for the highlighting
89
+ to occur (assuming no token is set for use when nothing is selected)
90
+
91
+ - name: highlight_selected_word.trim
92
+ input_type: checkbox
93
+ default: true
94
+ description: |
95
+ Trim whitespace from selection text before checking for minimum length
96
+
97
+ - name: highlight_selected_word.use_toggle_hotkey
98
+ input_type: checkbox
99
+ default: false
100
+ description: |
101
+ Bind the highlight_selected_word:toggle action to a hotkey
102
+
103
+ - name: highlight_selected_word.toggle_hotkey
104
+ input_type: hotkey
105
+ default: 'alt-h'
106
+ description: |
107
+ Hotkey to bind to the highlight_selected_word:toggle action (if selected
108
+ for use, above)
109
+
110
+ - name: highlight_selected_word.only_cells_in_scroll
111
+ input_type: checkbox
112
+ default: true
113
+ description: |
114
+ Only apply highlights to editors which are visible in the scrolled view.
115
+ This may offer performance benefits for larger notebooks
116
+
117
+ - name: highlight_selected_word.scroll_min_delay
118
+ input_type: number
119
+ default: 100
120
+ min: 0
121
+ step: 10
122
+ description: |
123
+ Minimum delay in ms between updating highlights on scrolling the notebook
124
+ (used only if limiting highlights to those in scrolled view, see above).
125
+ If set to zero, no update is done on scroll.
126
+
127
+ - name: highlight_selected_word.hide_selections_in_unfocussed
128
+ input_type: checkbox
129
+ default: false
130
+ description: |
131
+ Hide any text selection in non-focussed cells (can be confused with match highlights).
.local/share/jupyter/nbextensions/highlighter/demo_highlighter.ipynb ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "\n",
8
+ "## The highlighter extension:\n",
9
+ "\n",
10
+ "- First of all, the extension provides <span class=\"mark\">several toolbar buttons</span> for highlighting selected text _within a markdown cell_. Three different \\`color schemes' are provided, which can be easily customized in the \\textit{stylesheet} `highlighter.css`. The last button removes all highlighting in the current cell. \n",
11
+ "- This works both <span class=\"burk\">when the cell is _rendered_ and when the cell is in edit mode</span>; \n",
12
+ "- In both modes, it is possible to highlight formatted portions of text (In rendered mode, since the selected text loses its formatting, a heuristic is applied to find the best alignment with the actual text)\n",
13
+ "- When no text is selected, the whole cell is highlighted; \n",
14
+ "- The extension also provides two keyboard shortcuts (Alt-G and Alt-H) which fire the highlighting of the selected text. \n",
15
+ "- Highlights can be preserved when exporting to html or to LaTeX -- details are provided in [export_highlights](export_highlights.ipynb)\n",
16
+ "\n",
17
+ "\n",
18
+ "![](image.gif)\n",
19
+ "\n",
20
+ "## Installation:\n",
21
+ "\n",
22
+ "The extension can be installed with the nice UI available on the jupyter_contrib_nbextensions website, which also allows you to enable/disable the extension. \n",
23
+ "\n",
24
+ "You may also install the extension from the original repo: issue\n",
25
+ "```bash\n",
26
+ "jupyter nbextension install https://rawgit.com/jfbercher/small_nbextensions/master/highlighter.zip --user\n",
27
+ "\n",
28
+ "```\n",
29
+ "at the command line.\n",
30
+ "\n",
31
+ "### Testing: \n",
32
+ "\n",
33
+ "Use a code cell with\n",
34
+ "```javascript\n",
35
+ "%%javascript\n",
36
+ "require(\"base/js/utils\").load_extensions(\"highlighter/highlighter\")\n",
37
+ "```\n",
38
+ "\n",
39
+ "### Automatic load\n",
40
+ "You may also automatically load the extension for any notebook via\n",
41
+ "```bash\n",
42
+ "jupyter nbextension enable highlighter/highlighter\t\n",
43
+ "```\n"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 2,
49
+ "metadata": {
50
+ "collapsed": false
51
+ },
52
+ "outputs": [
53
+ {
54
+ "data": {
55
+ "application/javascript": [
56
+ "require(\"base/js/utils\").load_extensions(\"highlighter/highlighter\")"
57
+ ],
58
+ "text/plain": [
59
+ "<IPython.core.display.Javascript object>"
60
+ ]
61
+ },
62
+ "metadata": {},
63
+ "output_type": "display_data"
64
+ }
65
+ ],
66
+ "source": [
67
+ "%%javascript\n",
68
+ "require(\"base/js/utils\").load_extensions(\"highlighter/highlighter\")"
69
+ ]
70
+ }
71
+ ],
72
+ "metadata": {
73
+ "interactive_sols": {
74
+ "cbx_id": 1
75
+ },
76
+ "kernelspec": {
77
+ "display_name": "Python 3",
78
+ "language": "python",
79
+ "name": "python3"
80
+ },
81
+ "language_info": {
82
+ "codemirror_mode": {
83
+ "name": "ipython",
84
+ "version": 3
85
+ },
86
+ "file_extension": ".py",
87
+ "mimetype": "text/x-python",
88
+ "name": "python",
89
+ "nbconvert_exporter": "python",
90
+ "pygments_lexer": "ipython3",
91
+ "version": "3.4.3+"
92
+ }
93
+ },
94
+ "nbformat": 4,
95
+ "nbformat_minor": 0
96
+ }
.local/share/jupyter/nbextensions/keyboard_shortcut_editor/icon.png ADDED
.local/share/jupyter/nbextensions/keyboard_shortcut_editor/readme_undefined_key.png ADDED
.local/share/jupyter/nbextensions/load_tex_macros/main.js ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ define(function(require, exports, module) {
2
+ var Jupyter = require('base/js/namespace');
3
+
4
+ function loadLatexUserDefs() {
5
+ $.get('latexdefs.tex').done(function(data) {
6
+ data = data.replace(/^/gm, '\$\$\$').replace(/$/gm, '\$\$\$');
7
+ if ($('#latexdefs').length > 0) $('#latexdefs').remove();
8
+ $('body').append($('<div/>').attr('id', 'latexdefs').text(data));
9
+ console.log('latex_envs: loaded user LaTeX definitions latexdefs.tex');
10
+ }).fail(function() {
11
+ console.log('load_tex_macros: failed to load user LaTeX definitions latexdefs.tex')
12
+ });
13
+ }
14
+
15
+ function rerenderMaths() { // probably something like that
16
+ MathJax.Hub.Queue(
17
+ ["resetEquationNumbers",MathJax.InputJax.TeX],
18
+ ["PreProcess", MathJax.Hub],
19
+ ["Reprocess", MathJax.Hub]
20
+ );
21
+ }
22
+
23
+ function load_ipython_extension() {
24
+ "use strict";
25
+
26
+ if (Jupyter.notebook._fully_loaded) {
27
+ loadLatexUserDefs();
28
+ rerenderMaths();
29
+ } else {
30
+ $([Jupyter.events]).on("notebook_loaded.Notebook", function() {
31
+ loadLatexUserDefs();
32
+ rerenderMaths();
33
+ })
34
+ }
35
+ }
36
+ return {
37
+ load_ipython_extension: load_ipython_extension,
38
+ };
39
+ })
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin ADDED
Binary file (13.3 kB). View file
 
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx ADDED
@@ -0,0 +1,651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<15>;
22
+ .reg .b32 %r<91>;
23
+ .reg .f32 %f<62>;
24
+ .reg .b64 %rd<16>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd5, [triton__0d1d2d3de4e_param_0];
30
+ ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_1];
31
+ $L__tmp0:
32
+ .loc 1 22 44
33
+ mov.u32 %r24, %tid.x;
34
+ and.b32 %r25, %r24, 31;
35
+ ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_2];
36
+ and.b32 %r26, %r24, 3;
37
+ .loc 1 24 33
38
+ bfe.u32 %r27, %r24, 5, 2;
39
+ and.b32 %r28, %r24, 127;
40
+ .loc 1 21 28
41
+ mov.u32 %r1, %ctaid.x;
42
+ .loc 1 21 33
43
+ shl.b32 %r29, %r1, 2;
44
+ .loc 1 22 23
45
+ or.b32 %r30, %r29, %r26;
46
+ .loc 1 29 25
47
+ setp.lt.u32 %p1, %r28, 120;
48
+ .loc 1 31 47
49
+ shl.b32 %r31, %r28, 17;
50
+ .loc 1 31 40
51
+ add.s32 %r32, %r29, %r31;
52
+ .loc 1 31 34
53
+ mul.wide.s32 %rd8, %r32, 4;
54
+ add.s64 %rd1, %rd5, %rd8;
55
+ mov.b32 %r6, 0;
56
+ .loc 1 31 53
57
+ mov.u32 %r2, 0x0;
58
+ mov.u32 %r3, 0x0;
59
+ mov.u32 %r4, 0x0;
60
+ mov.u32 %r5, 0x0;
61
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
62
+ @!%p1 mov.u32 %r2, %r6;
63
+ @!%p1 mov.u32 %r3, %r6;
64
+ @!%p1 mov.u32 %r4, %r6;
65
+ @!%p1 mov.u32 %r5, %r6;
66
+ mov.b32 %f1, %r2;
67
+ mov.b32 %f2, %r3;
68
+ mov.b32 %f3, %r4;
69
+ mov.b32 %f4, %r5;
70
+ .loc 1 33 23
71
+ add.f32 %f5, %f1, 0f00000000;
72
+ add.f32 %f6, %f2, 0f00000000;
73
+ add.f32 %f7, %f3, 0f00000000;
74
+ add.f32 %f8, %f4, 0f00000000;
75
+ .loc 1 34 38
76
+ selp.f32 %f9, %f5, 0f00000000, %p1;
77
+ selp.f32 %f10, %f6, 0f00000000, %p1;
78
+ selp.f32 %f11, %f7, 0f00000000, %p1;
79
+ selp.f32 %f12, %f8, 0f00000000, %p1;
80
+ $L__tmp1:
81
+ .loc 2 243 36
82
+ mov.b32 %r33, %f9;
83
+ shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1;
84
+ mov.b32 %f13, %r34;
85
+ $L__tmp2:
86
+ .loc 2 233 15
87
+ add.f32 %f14, %f9, %f13;
88
+ $L__tmp3:
89
+ .loc 2 243 36
90
+ mov.b32 %r35, %f14;
91
+ shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1;
92
+ mov.b32 %f15, %r36;
93
+ $L__tmp4:
94
+ .loc 2 233 15
95
+ add.f32 %f16, %f14, %f15;
96
+ $L__tmp5:
97
+ .loc 2 243 36
98
+ mov.b32 %r37, %f16;
99
+ shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
100
+ mov.b32 %f17, %r38;
101
+ $L__tmp6:
102
+ .loc 2 233 15
103
+ add.f32 %f18, %f16, %f17;
104
+ $L__tmp7:
105
+ .loc 2 243 36
106
+ mov.b32 %r39, %f18;
107
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
108
+ mov.b32 %f19, %r40;
109
+ $L__tmp8:
110
+ .loc 2 233 15
111
+ add.f32 %f20, %f18, %f19;
112
+ $L__tmp9:
113
+ .loc 2 243 36
114
+ mov.b32 %r41, %f20;
115
+ shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
116
+ mov.b32 %f21, %r42;
117
+ $L__tmp10:
118
+ .loc 2 233 15
119
+ add.f32 %f22, %f20, %f21;
120
+ $L__tmp11:
121
+ .loc 2 243 36
122
+ mov.b32 %r43, %f10;
123
+ shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1;
124
+ mov.b32 %f23, %r44;
125
+ $L__tmp12:
126
+ .loc 2 233 15
127
+ add.f32 %f24, %f10, %f23;
128
+ $L__tmp13:
129
+ .loc 2 243 36
130
+ mov.b32 %r45, %f24;
131
+ shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1;
132
+ mov.b32 %f25, %r46;
133
+ $L__tmp14:
134
+ .loc 2 233 15
135
+ add.f32 %f26, %f24, %f25;
136
+ $L__tmp15:
137
+ .loc 2 243 36
138
+ mov.b32 %r47, %f26;
139
+ shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1;
140
+ mov.b32 %f27, %r48;
141
+ $L__tmp16:
142
+ .loc 2 233 15
143
+ add.f32 %f28, %f26, %f27;
144
+ $L__tmp17:
145
+ .loc 2 243 36
146
+ mov.b32 %r49, %f28;
147
+ shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1;
148
+ mov.b32 %f29, %r50;
149
+ $L__tmp18:
150
+ .loc 2 233 15
151
+ add.f32 %f30, %f28, %f29;
152
+ $L__tmp19:
153
+ .loc 2 243 36
154
+ mov.b32 %r51, %f30;
155
+ shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
156
+ mov.b32 %f31, %r52;
157
+ $L__tmp20:
158
+ .loc 2 233 15
159
+ add.f32 %f32, %f30, %f31;
160
+ $L__tmp21:
161
+ .loc 2 243 36
162
+ mov.b32 %r53, %f11;
163
+ shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1;
164
+ mov.b32 %f33, %r54;
165
+ $L__tmp22:
166
+ .loc 2 233 15
167
+ add.f32 %f34, %f11, %f33;
168
+ $L__tmp23:
169
+ .loc 2 243 36
170
+ mov.b32 %r55, %f34;
171
+ shfl.sync.bfly.b32 %r56, %r55, 8, 31, -1;
172
+ mov.b32 %f35, %r56;
173
+ $L__tmp24:
174
+ .loc 2 233 15
175
+ add.f32 %f36, %f34, %f35;
176
+ $L__tmp25:
177
+ .loc 2 243 36
178
+ mov.b32 %r57, %f36;
179
+ shfl.sync.bfly.b32 %r58, %r57, 4, 31, -1;
180
+ mov.b32 %f37, %r58;
181
+ $L__tmp26:
182
+ .loc 2 233 15
183
+ add.f32 %f38, %f36, %f37;
184
+ $L__tmp27:
185
+ .loc 2 243 36
186
+ mov.b32 %r59, %f38;
187
+ shfl.sync.bfly.b32 %r60, %r59, 2, 31, -1;
188
+ mov.b32 %f39, %r60;
189
+ $L__tmp28:
190
+ .loc 2 233 15
191
+ add.f32 %f40, %f38, %f39;
192
+ $L__tmp29:
193
+ .loc 2 243 36
194
+ mov.b32 %r61, %f40;
195
+ shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
196
+ mov.b32 %f41, %r62;
197
+ $L__tmp30:
198
+ .loc 2 233 15
199
+ add.f32 %f42, %f40, %f41;
200
+ $L__tmp31:
201
+ .loc 2 243 36
202
+ mov.b32 %r63, %f12;
203
+ shfl.sync.bfly.b32 %r64, %r63, 16, 31, -1;
204
+ mov.b32 %f43, %r64;
205
+ $L__tmp32:
206
+ .loc 2 233 15
207
+ add.f32 %f44, %f12, %f43;
208
+ $L__tmp33:
209
+ .loc 2 243 36
210
+ mov.b32 %r65, %f44;
211
+ shfl.sync.bfly.b32 %r66, %r65, 8, 31, -1;
212
+ mov.b32 %f45, %r66;
213
+ $L__tmp34:
214
+ .loc 2 233 15
215
+ add.f32 %f46, %f44, %f45;
216
+ $L__tmp35:
217
+ .loc 2 243 36
218
+ mov.b32 %r67, %f46;
219
+ shfl.sync.bfly.b32 %r68, %r67, 4, 31, -1;
220
+ mov.b32 %f47, %r68;
221
+ $L__tmp36:
222
+ .loc 2 233 15
223
+ add.f32 %f48, %f46, %f47;
224
+ $L__tmp37:
225
+ .loc 2 243 36
226
+ mov.b32 %r69, %f48;
227
+ shfl.sync.bfly.b32 %r70, %r69, 2, 31, -1;
228
+ mov.b32 %f49, %r70;
229
+ $L__tmp38:
230
+ .loc 2 233 15
231
+ add.f32 %f50, %f48, %f49;
232
+ $L__tmp39:
233
+ .loc 2 243 36
234
+ mov.b32 %r71, %f50;
235
+ shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
236
+ mov.b32 %f51, %r72;
237
+ $L__tmp40:
238
+ .loc 2 233 15
239
+ add.f32 %f52, %f50, %f51;
240
+ $L__tmp41:
241
+ .loc 2 243 36
242
+ setp.eq.s32 %p6, %r25, 0;
243
+ shl.b32 %r73, %r27, 2;
244
+ mov.u32 %r74, global_smem;
245
+ add.s32 %r10, %r74, %r73;
246
+ mov.b32 %r11, %f22;
247
+ @%p6 st.shared.b32 [ %r10 + 0 ], %r11;
248
+ add.s32 %r12, %r10, 16;
249
+ mov.b32 %r13, %f32;
250
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r13;
251
+ add.s32 %r14, %r10, 32;
252
+ mov.b32 %r15, %f42;
253
+ @%p6 st.shared.b32 [ %r14 + 0 ], %r15;
254
+ add.s32 %r16, %r10, 48;
255
+ mov.b32 %r17, %f52;
256
+ @%p6 st.shared.b32 [ %r16 + 0 ], %r17;
257
+ bar.sync 0;
258
+ setp.lt.s32 %p10, %r24, 16;
259
+ shl.b32 %r75, %r24, 2;
260
+ add.s32 %r19, %r74, %r75;
261
+ @%p10 ld.shared.b32 %r18, [ %r19 + 0 ];
262
+ mov.b32 %f53, %r18;
263
+ shfl.sync.bfly.b32 %r76, %r18, 2, 31, -1;
264
+ mov.b32 %f54, %r76;
265
+ $L__tmp42:
266
+ .loc 2 233 15
267
+ add.f32 %f55, %f53, %f54;
268
+ $L__tmp43:
269
+ .loc 2 243 36
270
+ mov.b32 %r77, %f55;
271
+ shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1;
272
+ mov.b32 %f56, %r78;
273
+ $L__tmp44:
274
+ .loc 2 233 15
275
+ add.f32 %f57, %f55, %f56;
276
+ $L__tmp45:
277
+ .loc 2 243 36
278
+ setp.eq.s32 %p14, %r26, 0;
279
+ and.pred %p11, %p10, %p14;
280
+ mov.b32 %r21, %f57;
281
+ @%p11 st.shared.b32 [ %r19 + 0 ], %r21;
282
+ bar.sync 0;
283
+ ld.shared.f32 %f58, [global_smem];
284
+ ld.shared.f32 %f59, [global_smem+16];
285
+ ld.shared.f32 %f60, [global_smem+32];
286
+ ld.shared.f32 %f61, [global_smem+48];
287
+ $L__tmp46:
288
+ .loc 1 35 28
289
+ bar.sync 0;
290
+ st.shared.f32 [global_smem], %f58;
291
+ st.shared.f32 [global_smem+4], %f59;
292
+ st.shared.f32 [global_smem+8], %f60;
293
+ st.shared.f32 [global_smem+12], %f61;
294
+ bar.sync 0;
295
+ shl.b32 %r79, %r26, 2;
296
+ add.s32 %r80, %r74, %r79;
297
+ .loc 1 36 20
298
+ shr.s32 %r82, %r30, 31;
299
+ shr.u32 %r83, %r82, 24;
300
+ add.s32 %r84, %r30, %r83;
301
+ shr.s32 %r85, %r84, 8;
302
+ and.b32 %r86, %r84, -256;
303
+ sub.s32 %r87, %r30, %r86;
304
+ .loc 1 38 30
305
+ mul.wide.s32 %rd9, %r85, 8;
306
+ add.s64 %rd3, %rd6, %rd9;
307
+ .loc 1 45 55
308
+ ld.shared.u32 %r23, [%r80];
309
+ mov.pred %p12, -1;
310
+ .loc 1 38 35
311
+ mov.u64 %rd2, 0x0;
312
+ @%p12 ld.global.L1::evict_last.b64 { %rd2 }, [ %rd3 + 0 ];
313
+ .loc 1 41 32
314
+ shr.u64 %rd10, %rd2, 54;
315
+ and.b64 %rd11, %rd10, 512;
316
+ add.s64 %rd12, %rd11, %rd2;
317
+ .loc 1 45 30
318
+ shl.b64 %rd13, %rd12, 10;
319
+ add.s64 %rd14, %rd7, %rd13;
320
+ mul.wide.s32 %rd15, %r87, 4;
321
+ add.s64 %rd4, %rd14, %rd15;
322
+ .loc 1 45 55
323
+ bfe.u32 %r88, %r24, 2, 3;
324
+ shl.b32 %r89, %r27, 3;
325
+ or.b32 %r90, %r89, %r88;
326
+ setp.eq.s32 %p13, %r90, 0;
327
+ mov.u32 %r22, 0x0;
328
+ @%p13 atom.global.gpu.acq_rel.add.f32 %r22, [ %rd4 + 0 ], %r23;
329
+ .loc 1 45 4
330
+ ret;
331
+ $L__tmp47:
332
+ $L__func_end0:
333
+
334
+ }
335
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
336
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
337
+ .section .debug_abbrev
338
+ {
339
+ .b8 1
340
+ .b8 17
341
+ .b8 1
342
+ .b8 37
343
+ .b8 8
344
+ .b8 19
345
+ .b8 5
346
+ .b8 3
347
+ .b8 8
348
+ .b8 16
349
+ .b8 6
350
+ .b8 27
351
+ .b8 8
352
+ .b8 180
353
+ .b8 66
354
+ .b8 12
355
+ .b8 17
356
+ .b8 1
357
+ .b8 18
358
+ .b8 1
359
+ .b8 0
360
+ .b8 0
361
+ .b8 2
362
+ .b8 46
363
+ .b8 0
364
+ .b8 135
365
+ .b8 64
366
+ .b8 8
367
+ .b8 3
368
+ .b8 8
369
+ .b8 58
370
+ .b8 11
371
+ .b8 59
372
+ .b8 11
373
+ .b8 63
374
+ .b8 12
375
+ .b8 32
376
+ .b8 11
377
+ .b8 0
378
+ .b8 0
379
+ .b8 3
380
+ .b8 46
381
+ .b8 1
382
+ .b8 17
383
+ .b8 1
384
+ .b8 18
385
+ .b8 1
386
+ .b8 64
387
+ .b8 10
388
+ .b8 49
389
+ .b8 19
390
+ .b8 0
391
+ .b8 0
392
+ .b8 4
393
+ .b8 29
394
+ .b8 0
395
+ .b8 49
396
+ .b8 19
397
+ .b8 17
398
+ .b8 1
399
+ .b8 18
400
+ .b8 1
401
+ .b8 88
402
+ .b8 11
403
+ .b8 89
404
+ .b8 11
405
+ .b8 87
406
+ .b8 11
407
+ .b8 0
408
+ .b8 0
409
+ .b8 5
410
+ .b8 29
411
+ .b8 1
412
+ .b8 49
413
+ .b8 19
414
+ .b8 17
415
+ .b8 1
416
+ .b8 18
417
+ .b8 1
418
+ .b8 88
419
+ .b8 11
420
+ .b8 89
421
+ .b8 11
422
+ .b8 87
423
+ .b8 11
424
+ .b8 0
425
+ .b8 0
426
+ .b8 0
427
+ }
428
+ .section .debug_info
429
+ {
430
+ .b32 264
431
+ .b8 2
432
+ .b8 0
433
+ .b32 .debug_abbrev
434
+ .b8 8
435
+ .b8 1
436
+ .b8 116
437
+ .b8 114
438
+ .b8 105
439
+ .b8 116
440
+ .b8 111
441
+ .b8 110
442
+ .b8 0
443
+ .b8 2
444
+ .b8 0
445
+ .b8 99
446
+ .b8 54
447
+ .b8 105
448
+ .b8 107
449
+ .b8 53
450
+ .b8 118
451
+ .b8 120
452
+ .b8 55
453
+ .b8 112
454
+ .b8 50
455
+ .b8 50
456
+ .b8 102
457
+ .b8 112
458
+ .b8 107
459
+ .b8 52
460
+ .b8 100
461
+ .b8 99
462
+ .b8 118
463
+ .b8 104
464
+ .b8 53
465
+ .b8 53
466
+ .b8 122
467
+ .b8 105
468
+ .b8 109
469
+ .b8 119
470
+ .b8 52
471
+ .b8 116
472
+ .b8 53
473
+ .b8 110
474
+ .b8 114
475
+ .b8 53
476
+ .b8 122
477
+ .b8 110
478
+ .b8 50
479
+ .b8 98
480
+ .b8 55
481
+ .b8 105
482
+ .b8 110
483
+ .b8 117
484
+ .b8 106
485
+ .b8 120
486
+ .b8 106
487
+ .b8 97
488
+ .b8 117
489
+ .b8 120
490
+ .b8 115
491
+ .b8 104
492
+ .b8 108
493
+ .b8 106
494
+ .b8 117
495
+ .b8 109
496
+ .b8 109
497
+ .b8 46
498
+ .b8 112
499
+ .b8 121
500
+ .b8 0
501
+ .b32 .debug_line
502
+ .b8 47
503
+ .b8 116
504
+ .b8 109
505
+ .b8 112
506
+ .b8 47
507
+ .b8 116
508
+ .b8 111
509
+ .b8 114
510
+ .b8 99
511
+ .b8 104
512
+ .b8 105
513
+ .b8 110
514
+ .b8 100
515
+ .b8 117
516
+ .b8 99
517
+ .b8 116
518
+ .b8 111
519
+ .b8 114
520
+ .b8 95
521
+ .b8 114
522
+ .b8 111
523
+ .b8 111
524
+ .b8 116
525
+ .b8 47
526
+ .b8 54
527
+ .b8 105
528
+ .b8 0
529
+ .b8 1
530
+ .b64 $L__func_begin0
531
+ .b64 $L__func_end0
532
+ .b8 2
533
+ .b8 116
534
+ .b8 114
535
+ .b8 105
536
+ .b8 116
537
+ .b8 111
538
+ .b8 110
539
+ .b8 95
540
+ .b8 95
541
+ .b8 48
542
+ .b8 100
543
+ .b8 49
544
+ .b8 100
545
+ .b8 50
546
+ .b8 100
547
+ .b8 51
548
+ .b8 100
549
+ .b8 101
550
+ .b8 52
551
+ .b8 101
552
+ .b8 0
553
+ .b8 116
554
+ .b8 114
555
+ .b8 105
556
+ .b8 116
557
+ .b8 111
558
+ .b8 110
559
+ .b8 95
560
+ .b8 95
561
+ .b8 48
562
+ .b8 100
563
+ .b8 49
564
+ .b8 100
565
+ .b8 50
566
+ .b8 100
567
+ .b8 51
568
+ .b8 100
569
+ .b8 101
570
+ .b8 52
571
+ .b8 101
572
+ .b8 0
573
+ .b8 1
574
+ .b8 18
575
+ .b8 1
576
+ .b8 1
577
+ .b8 3
578
+ .b64 $L__func_begin0
579
+ .b64 $L__func_end0
580
+ .b8 1
581
+ .b8 156
582
+ .b32 125
583
+ .b8 4
584
+ .b32 125
585
+ .b64 $L__tmp1
586
+ .b64 $L__tmp46
587
+ .b8 2
588
+ .b8 35
589
+ .b8 25
590
+ .b8 5
591
+ .b32 125
592
+ .b64 $L__tmp2
593
+ .b64 $L__tmp45
594
+ .b8 2
595
+ .b8 35
596
+ .b8 25
597
+ .b8 4
598
+ .b32 125
599
+ .b64 $L__tmp2
600
+ .b64 $L__tmp45
601
+ .b8 2
602
+ .b8 243
603
+ .b8 36
604
+ .b8 0
605
+ .b8 0
606
+ .b8 0
607
+ }
608
+ .section .debug_pubnames
609
+ {
610
+ .b32 $L__pubNames_end0-$L__pubNames_start0
611
+ $L__pubNames_start0:
612
+ .b8 2
613
+ .b8 0
614
+ .b32 .debug_info
615
+ .b32 268
616
+ .b32 125
617
+ .b8 116
618
+ .b8 114
619
+ .b8 105
620
+ .b8 116
621
+ .b8 111
622
+ .b8 110
623
+ .b8 95
624
+ .b8 95
625
+ .b8 48
626
+ .b8 100
627
+ .b8 49
628
+ .b8 100
629
+ .b8 50
630
+ .b8 100
631
+ .b8 51
632
+ .b8 100
633
+ .b8 101
634
+ .b8 52
635
+ .b8 101
636
+ .b8 0
637
+ .b32 0
638
+ $L__pubNames_end0:
639
+ }
640
+ .section .debug_pubtypes
641
+ {
642
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
643
+ $L__pubTypes_start0:
644
+ .b8 2
645
+ .b8 0
646
+ .b32 .debug_info
647
+ .b32 268
648
+ .b32 0
649
+ $L__pubTypes_end0:
650
+ }
651
+ .section .debug_loc { }
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<4x1xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<4x1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<512> : tensor<4x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<4x1xi32, #blocked>
9
+ %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
10
+ %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
11
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1>
12
+ %cst_6 = arith.constant dense<true> : tensor<4x1xi1, #blocked>
13
+ %c4_i32 = arith.constant 4 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c4_i32 : i32
16
+ %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
17
+ %3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4x1xi32, #blocked1>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xi32, #blocked>
20
+ %6 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked1>
21
+ %7 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked>
22
+ %8 = arith.addi %6, %4 : tensor<4x1xi32, #blocked1>
23
+ %9 = arith.addi %7, %5 : tensor<4x1xi32, #blocked>
24
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
26
+ %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
27
+ %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
28
+ %14 = tt.broadcast %8 : (tensor<4x1xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
29
+ %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
30
+ %16 = arith.addi %14, %15 : tensor<4x128xi32, #blocked1>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>, #blocked1>
32
+ %18 = tt.addptr %17, %16 : tensor<4x128x!tt.ptr<f32, 1>, #blocked1>, tensor<4x128xi32, #blocked1>
33
+ %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<4x128xi1, #blocked1>
34
+ %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32, #blocked1>
35
+ %21 = arith.addf %20, %cst_5 : tensor<4x128xf32, #blocked1>
36
+ %22 = arith.select %19, %21, %cst_5 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1>
37
+ %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
38
+ ^bb0(%arg5: f32, %arg6: f32):
39
+ %40 = arith.addf %arg5, %arg6 : f32
40
+ tt.reduce.return %40 : f32
41
+ }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
42
+ %24 = triton_gpu.convert_layout %23 : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
43
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xf32, #blocked>
44
+ %26 = arith.divsi %9, %cst_2 : tensor<4x1xi32, #blocked>
45
+ %27 = arith.remsi %9, %cst_2 : tensor<4x1xi32, #blocked>
46
+ %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>, #blocked>
47
+ %29 = tt.addptr %28, %26 : tensor<4x1x!tt.ptr<i64, 1>, #blocked>, tensor<4x1xi32, #blocked>
48
+ %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64, #blocked>
49
+ %31 = arith.addi %30, %cst_1 : tensor<4x1xi64, #blocked>
50
+ %32 = arith.cmpi slt, %30, %cst_0 : tensor<4x1xi64, #blocked>
51
+ %33 = arith.select %32, %31, %30 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked>
52
+ %34 = arith.muli %33, %cst : tensor<4x1xi64, #blocked>
53
+ %35 = arith.extsi %27 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked>
54
+ %36 = arith.addi %35, %34 : tensor<4x1xi64, #blocked>
55
+ %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>, #blocked>
56
+ %38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xi64, #blocked>
57
+ %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xf32, #blocked>, tensor<4x1xi1, #blocked>) -> tensor<4x1xf32, #blocked>
58
+ tt.return
59
+ }
60
+ }
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<4x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<4x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<4x1xi64>
6
+ %cst_2 = arith.constant dense<true> : tensor<4x1xi1>
7
+ %cst_3 = arith.constant dense<256> : tensor<4x1xi32>
8
+ %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
9
+ %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
10
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xf32>
11
+ %c4_i32 = arith.constant 4 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c4_i32 : i32
14
+ %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32>) -> tensor<4x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<4x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<4x1xi32>
18
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
20
+ %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
21
+ %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
22
+ %10 = tt.broadcast %5 : (tensor<4x1xi32>) -> tensor<4x128xi32>
23
+ %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<4x128xi32>
24
+ %12 = arith.addi %10, %11 : tensor<4x128xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>>
26
+ %14 = tt.addptr %13, %12 : tensor<4x128x!tt.ptr<f32, 1>>, tensor<4x128xi32>
27
+ %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<4x128xi1>
28
+ %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32>
29
+ %17 = arith.addf %16, %cst_6 : tensor<4x128xf32>
30
+ %18 = arith.select %15, %17, %cst_6 : tensor<4x128xi1>, tensor<4x128xf32>
31
+ %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
32
+ ^bb0(%arg5: f32, %arg6: f32):
33
+ %35 = arith.addf %arg5, %arg6 : f32
34
+ tt.reduce.return %35 : f32
35
+ }) : (tensor<4x128xf32>) -> tensor<4xf32>
36
+ %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<4xf32>) -> tensor<4x1xf32>
37
+ %21 = arith.divsi %5, %cst_3 : tensor<4x1xi32>
38
+ %22 = arith.remsi %5, %cst_3 : tensor<4x1xi32>
39
+ %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>>
40
+ %24 = tt.addptr %23, %21 : tensor<4x1x!tt.ptr<i64, 1>>, tensor<4x1xi32>
41
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64>
42
+ %26 = arith.addi %25, %cst_1 : tensor<4x1xi64>
43
+ %27 = arith.cmpi slt, %25, %cst_0 : tensor<4x1xi64>
44
+ %28 = arith.select %27, %26, %25 : tensor<4x1xi1>, tensor<4x1xi64>
45
+ %29 = arith.muli %28, %cst : tensor<4x1xi64>
46
+ %30 = arith.extsi %22 : tensor<4x1xi32> to tensor<4x1xi64>
47
+ %31 = arith.addi %30, %29 : tensor<4x1xi64>
48
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>>
49
+ %33 = tt.addptr %32, %31 : tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xi64>
50
+ %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xf32>, tensor<4x1xi1>) -> tensor<4x1xf32>
51
+ tt.return
52
+ }
53
+ }
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.cubin ADDED
Binary file (7.07 kB). View file
 
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %5 = and i32 %4, 127, !dbg !8
9
+ %6 = shl nuw nsw i32 %5, 3, !dbg !8
10
+ %7 = shl nuw nsw i32 %5, 2, !dbg !8
11
+ %8 = or i32 %7, 512, !dbg !8
12
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
13
+ %10 = shl i32 %9, 10, !dbg !10
14
+ %11 = or i32 %10, %6, !dbg !11
15
+ %12 = or i32 %10, %7, !dbg !11
16
+ %13 = or i32 %10, %8, !dbg !11
17
+ %14 = sext i32 %11 to i64, !dbg !12
18
+ %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
19
+ %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
20
+ %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
21
+ %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
22
+ %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
23
+ %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
24
+ %21 = trunc i32 %17 to i16, !dbg !13
25
+ %extelt.offset = lshr i32 %17, 16, !dbg !13
26
+ %22 = trunc i32 %extelt.offset to i16, !dbg !13
27
+ %23 = trunc i32 %18 to i16, !dbg !13
28
+ %extelt.offset1 = lshr i32 %18, 16, !dbg !13
29
+ %24 = trunc i32 %extelt.offset1 to i16, !dbg !13
30
+ %25 = trunc i32 %19 to i16, !dbg !13
31
+ %extelt.offset2 = lshr i32 %19, 16, !dbg !13
32
+ %26 = trunc i32 %extelt.offset2 to i16, !dbg !13
33
+ %27 = trunc i32 %20 to i16, !dbg !13
34
+ %extelt.offset3 = lshr i32 %20, 16, !dbg !13
35
+ %28 = trunc i32 %extelt.offset3 to i16, !dbg !13
36
+ %29 = zext nneg i32 %6 to i64, !dbg !14
37
+ %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
38
+ %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
39
+ store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
40
+ %32 = or i32 %6, 1, !dbg !14
41
+ %33 = zext nneg i32 %32 to i64, !dbg !14
42
+ %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
43
+ %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
44
+ store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
45
+ %36 = or i32 %6, 2, !dbg !14
46
+ %37 = zext nneg i32 %36 to i64, !dbg !14
47
+ %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
48
+ %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
49
+ store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
50
+ %40 = or i32 %6, 3, !dbg !14
51
+ %41 = zext nneg i32 %40 to i64, !dbg !14
52
+ %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
53
+ %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
54
+ store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
55
+ %44 = or i32 %6, 4, !dbg !14
56
+ %45 = zext nneg i32 %44 to i64, !dbg !14
57
+ %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
58
+ %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
59
+ store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
60
+ %48 = or i32 %6, 5, !dbg !14
61
+ %49 = zext nneg i32 %48 to i64, !dbg !14
62
+ %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
63
+ %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
64
+ store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
65
+ %52 = or i32 %6, 6, !dbg !14
66
+ %53 = zext nneg i32 %52 to i64, !dbg !14
67
+ %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
68
+ %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
69
+ store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
70
+ %56 = or i32 %6, 7, !dbg !14
71
+ %57 = zext nneg i32 %56 to i64, !dbg !14
72
+ %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
73
+ %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
74
+ store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
75
+ tail call void @llvm.nvvm.barrier0(), !dbg !14
76
+ %60 = zext nneg i32 %7 to i64, !dbg !14
77
+ %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
78
+ %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
79
+ %63 = or i32 %7, 1, !dbg !14
80
+ %64 = zext nneg i32 %63 to i64, !dbg !14
81
+ %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
82
+ %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
83
+ %67 = or i32 %7, 2, !dbg !14
84
+ %68 = zext nneg i32 %67 to i64, !dbg !14
85
+ %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
86
+ %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
87
+ %71 = or i32 %7, 3, !dbg !14
88
+ %72 = zext nneg i32 %71 to i64, !dbg !14
89
+ %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
90
+ %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
91
+ %75 = zext nneg i32 %8 to i64, !dbg !14
92
+ %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
93
+ %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
94
+ %78 = or i32 %7, 513, !dbg !14
95
+ %79 = zext nneg i32 %78 to i64, !dbg !14
96
+ %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
97
+ %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
98
+ %82 = or i32 %7, 514, !dbg !14
99
+ %83 = zext nneg i32 %82 to i64, !dbg !14
100
+ %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
101
+ %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
102
+ %86 = or i32 %7, 515, !dbg !14
103
+ %87 = zext nneg i32 %86 to i64, !dbg !14
104
+ %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
105
+ %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
106
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
107
+ %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
108
+ %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
109
+ %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
110
+ %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
111
+ %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
112
+ %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
113
+ %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
114
+ %98 = sext i32 %12 to i64, !dbg !15
115
+ %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
116
+ %100 = sext i32 %13 to i64, !dbg !15
117
+ %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
118
+ %102 = bitcast float %90 to i32, !dbg !16
119
+ %103 = bitcast float %91 to i32, !dbg !16
120
+ %104 = bitcast float %92 to i32, !dbg !16
121
+ %105 = bitcast float %93 to i32, !dbg !16
122
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
123
+ %106 = bitcast float %94 to i32, !dbg !16
124
+ %107 = bitcast float %95 to i32, !dbg !16
125
+ %108 = bitcast float %96 to i32, !dbg !16
126
+ %109 = bitcast float %97 to i32, !dbg !16
127
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
128
+ ret void, !dbg !17
129
+ }
130
+
131
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
132
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
133
+
134
+ ; Function Attrs: convergent nocallback nounwind
135
+ declare void @llvm.nvvm.barrier0() #1
136
+
137
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
138
+ attributes #1 = { convergent nocallback nounwind }
139
+ attributes #2 = { nounwind }
140
+
141
+ !llvm.module.flags = !{!0}
142
+ !llvm.dbg.cu = !{!1}
143
+ !nvvm.annotations = !{!3, !4, !4, !3}
144
+
145
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
146
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
147
+ !2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot")
148
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
149
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
150
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
151
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
152
+ !7 = !{}
153
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
154
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
155
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
156
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
157
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
158
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
159
+ !14 = !DILocation(line: 24, column: 44, scope: !5)
160
+ !15 = !DILocation(line: 26, column: 25, scope: !5)
161
+ !16 = !DILocation(line: 26, column: 36, scope: !5)
162
+ !17 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 128, 1, 1
18
+ {
19
+ .reg .pred %p<4>;
20
+ .reg .b16 %rs<9>;
21
+ .reg .b32 %r<37>;
22
+ .reg .b64 %rd<13>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
28
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
29
+ $L__tmp0:
30
+ .loc 1 21 36
31
+ mov.u32 %r22, %tid.x;
32
+ and.b32 %r23, %r22, 127;
33
+ shl.b32 %r24, %r23, 3;
34
+ shl.b32 %r25, %r23, 2;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r26, %r1, 10;
39
+ .loc 1 21 23
40
+ or.b32 %r27, %r26, %r24;
41
+ or.b32 %r28, %r26, %r25;
42
+ .loc 1 24 30
43
+ mul.wide.s32 %rd6, %r27, 2;
44
+ add.s64 %rd1, %rd4, %rd6;
45
+ mov.pred %p1, -1;
46
+ .loc 1 24 35
47
+ mov.u32 %r2, 0x0;
48
+ mov.u32 %r3, 0x0;
49
+ mov.u32 %r4, 0x0;
50
+ mov.u32 %r5, 0x0;
51
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
52
+ shr.u32 %r29, %r2, 16;
53
+ shr.u32 %r30, %r3, 16;
54
+ shr.u32 %r31, %r4, 16;
55
+ shr.u32 %r32, %r5, 16;
56
+ .loc 1 24 44
57
+ shl.b32 %r33, %r23, 4;
58
+ mov.u32 %r34, global_smem;
59
+ add.s32 %r35, %r34, %r33;
60
+ st.shared.u16 [%r35], %r2;
61
+ st.shared.u16 [%r35+2], %r29;
62
+ st.shared.u16 [%r35+4], %r3;
63
+ st.shared.u16 [%r35+6], %r30;
64
+ st.shared.u16 [%r35+8], %r4;
65
+ st.shared.u16 [%r35+10], %r31;
66
+ st.shared.u16 [%r35+12], %r5;
67
+ st.shared.u16 [%r35+14], %r32;
68
+ bar.sync 0;
69
+ add.s32 %r36, %r34, %r24;
70
+ ld.shared.u16 %rs1, [%r36];
71
+ ld.shared.u16 %rs2, [%r36+2];
72
+ ld.shared.u16 %rs3, [%r36+4];
73
+ ld.shared.u16 %rs4, [%r36+6];
74
+ ld.shared.u16 %rs5, [%r36+1024];
75
+ ld.shared.u16 %rs6, [%r36+1026];
76
+ ld.shared.u16 %rs7, [%r36+1028];
77
+ ld.shared.u16 %rs8, [%r36+1030];
78
+ cvt.f32.bf16 %r14, %rs1;
79
+ cvt.f32.bf16 %r15, %rs2;
80
+ cvt.f32.bf16 %r16, %rs3;
81
+ cvt.f32.bf16 %r17, %rs4;
82
+ cvt.f32.bf16 %r18, %rs5;
83
+ cvt.f32.bf16 %r19, %rs6;
84
+ cvt.f32.bf16 %r20, %rs7;
85
+ cvt.f32.bf16 %r21, %rs8;
86
+ .loc 1 26 25
87
+ mul.wide.s32 %rd7, %r28, 4;
88
+ add.s64 %rd2, %rd5, %rd7;
89
+ cvt.s64.s32 %rd8, %r26;
90
+ cvt.u64.u32 %rd9, %r25;
91
+ or.b64 %rd10, %rd8, %rd9;
92
+ shl.b64 %rd11, %rd10, 2;
93
+ add.s64 %rd12, %rd5, %rd11;
94
+ add.s64 %rd3, %rd12, 2048;
95
+ .loc 1 26 36
96
+ @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
97
+ @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
98
+ .loc 1 26 4
99
+ ret;
100
+ $L__tmp1:
101
+ $L__func_end0:
102
+
103
+ }
104
+ .file 1 "/tmp/torchinductor_root/ot/cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py"
105
+ .section .debug_abbrev
106
+ {
107
+ .b8 1
108
+ .b8 17
109
+ .b8 1
110
+ .b8 37
111
+ .b8 8
112
+ .b8 19
113
+ .b8 5
114
+ .b8 3
115
+ .b8 8
116
+ .b8 16
117
+ .b8 6
118
+ .b8 27
119
+ .b8 8
120
+ .b8 180
121
+ .b8 66
122
+ .b8 12
123
+ .b8 17
124
+ .b8 1
125
+ .b8 18
126
+ .b8 1
127
+ .b8 0
128
+ .b8 0
129
+ .b8 2
130
+ .b8 46
131
+ .b8 0
132
+ .b8 17
133
+ .b8 1
134
+ .b8 18
135
+ .b8 1
136
+ .b8 64
137
+ .b8 10
138
+ .b8 135
139
+ .b8 64
140
+ .b8 8
141
+ .b8 3
142
+ .b8 8
143
+ .b8 58
144
+ .b8 11
145
+ .b8 59
146
+ .b8 11
147
+ .b8 63
148
+ .b8 12
149
+ .b8 0
150
+ .b8 0
151
+ .b8 0
152
+ }
153
+ .section .debug_info
154
+ {
155
+ .b32 176
156
+ .b8 2
157
+ .b8 0
158
+ .b32 .debug_abbrev
159
+ .b8 8
160
+ .b8 1
161
+ .b8 116
162
+ .b8 114
163
+ .b8 105
164
+ .b8 116
165
+ .b8 111
166
+ .b8 110
167
+ .b8 0
168
+ .b8 2
169
+ .b8 0
170
+ .b8 99
171
+ .b8 111
172
+ .b8 116
173
+ .b8 98
174
+ .b8 104
175
+ .b8 101
176
+ .b8 116
177
+ .b8 51
178
+ .b8 55
179
+ .b8 118
180
+ .b8 54
181
+ .b8 109
182
+ .b8 104
183
+ .b8 53
184
+ .b8 115
185
+ .b8 97
186
+ .b8 109
187
+ .b8 113
188
+ .b8 108
189
+ .b8 55
190
+ .b8 117
191
+ .b8 120
192
+ .b8 114
193
+ .b8 101
194
+ .b8 51
195
+ .b8 104
196
+ .b8 112
197
+ .b8 114
198
+ .b8 112
199
+ .b8 110
200
+ .b8 98
201
+ .b8 104
202
+ .b8 117
203
+ .b8 118
204
+ .b8 105
205
+ .b8 109
206
+ .b8 51
207
+ .b8 102
208
+ .b8 109
209
+ .b8 114
210
+ .b8 106
211
+ .b8 112
212
+ .b8 113
213
+ .b8 53
214
+ .b8 102
215
+ .b8 103
216
+ .b8 103
217
+ .b8 54
218
+ .b8 108
219
+ .b8 119
220
+ .b8 98
221
+ .b8 105
222
+ .b8 46
223
+ .b8 112
224
+ .b8 121
225
+ .b8 0
226
+ .b32 .debug_line
227
+ .b8 47
228
+ .b8 116
229
+ .b8 109
230
+ .b8 112
231
+ .b8 47
232
+ .b8 116
233
+ .b8 111
234
+ .b8 114
235
+ .b8 99
236
+ .b8 104
237
+ .b8 105
238
+ .b8 110
239
+ .b8 100
240
+ .b8 117
241
+ .b8 99
242
+ .b8 116
243
+ .b8 111
244
+ .b8 114
245
+ .b8 95
246
+ .b8 114
247
+ .b8 111
248
+ .b8 111
249
+ .b8 116
250
+ .b8 47
251
+ .b8 111
252
+ .b8 116
253
+ .b8 0
254
+ .b8 1
255
+ .b64 $L__func_begin0
256
+ .b64 $L__func_end0
257
+ .b8 2
258
+ .b64 $L__func_begin0
259
+ .b64 $L__func_end0
260
+ .b8 1
261
+ .b8 156
262
+ .b8 116
263
+ .b8 114
264
+ .b8 105
265
+ .b8 116
266
+ .b8 111
267
+ .b8 110
268
+ .b8 95
269
+ .b8 95
270
+ .b8 48
271
+ .b8 100
272
+ .b8 49
273
+ .b8 100
274
+ .b8 50
275
+ .b8 100
276
+ .b8 101
277
+ .b8 0
278
+ .b8 116
279
+ .b8 114
280
+ .b8 105
281
+ .b8 116
282
+ .b8 111
283
+ .b8 110
284
+ .b8 95
285
+ .b8 95
286
+ .b8 48
287
+ .b8 100
288
+ .b8 49
289
+ .b8 100
290
+ .b8 50
291
+ .b8 100
292
+ .b8 101
293
+ .b8 0
294
+ .b8 1
295
+ .b8 18
296
+ .b8 1
297
+ .b8 0
298
+ }
299
+ .section .debug_pubnames
300
+ {
301
+ .b32 $L__pubNames_end0-$L__pubNames_start0
302
+ $L__pubNames_start0:
303
+ .b8 2
304
+ .b8 0
305
+ .b32 .debug_info
306
+ .b32 180
307
+ .b32 125
308
+ .b8 116
309
+ .b8 114
310
+ .b8 105
311
+ .b8 116
312
+ .b8 111
313
+ .b8 110
314
+ .b8 95
315
+ .b8 95
316
+ .b8 48
317
+ .b8 100
318
+ .b8 49
319
+ .b8 100
320
+ .b8 50
321
+ .b8 100
322
+ .b8 101
323
+ .b8 0
324
+ .b32 0
325
+ $L__pubNames_end0:
326
+ }
327
+ .section .debug_pubtypes
328
+ {
329
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
330
+ $L__pubTypes_start0:
331
+ .b8 2
332
+ .b8 0
333
+ .b32 .debug_info
334
+ .b32 180
335
+ .b32 0
336
+ $L__pubTypes_end0:
337
+ }
338
+ .section .debug_loc { }
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4de(
13
+ .param .u64 triton__0d1d2d3de4de_param_0,
14
+ .param .u64 triton__0d1d2d3de4de_param_1,
15
+ .param .u64 triton__0d1d2d3de4de_param_2,
16
+ .param .u32 triton__0d1d2d3de4de_param_3,
17
+ .param .u32 triton__0d1d2d3de4de_param_4
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<22>;
22
+ .reg .b32 %r<98>;
23
+ .reg .f32 %f<47>;
24
+ .reg .b64 %rd<9>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
30
+ ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
31
+ ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
32
+ $L__tmp0:
33
+ .loc 1 22 44
34
+ mov.u32 %r1, %tid.x;
35
+ and.b32 %r2, %r1, 31;
36
+ shl.b32 %r13, %r1, 2;
37
+ and.b32 %r3, %r13, 60;
38
+ .loc 1 24 33
39
+ bfe.u32 %r4, %r1, 5, 2;
40
+ .loc 1 21 28
41
+ mov.u32 %r11, %ctaid.x;
42
+ .loc 1 21 33
43
+ shl.b32 %r5, %r11, 6;
44
+ .loc 1 22 23
45
+ or.b32 %r14, %r5, %r3;
46
+ .loc 1 26 20
47
+ shr.s32 %r16, %r14, 31;
48
+ shr.u32 %r17, %r16, 24;
49
+ add.s32 %r18, %r14, %r17;
50
+ shr.s32 %r19, %r18, 8;
51
+ .loc 1 29 36
52
+ mad.lo.s32 %r20, %r19, 32512, %r14;
53
+ shl.b32 %r21, %r4, 9;
54
+ add.s32 %r22, %r20, %r21;
55
+ shl.b32 %r23, %r1, 4;
56
+ and.b32 %r24, %r23, 256;
57
+ add.s32 %r96, %r22, %r24;
58
+ mov.f32 %f43, 0f00000000;
59
+ mov.b32 %r97, -8;
60
+ mov.pred %p1, -1;
61
+ mov.f32 %f44, %f43;
62
+ mov.f32 %f45, %f43;
63
+ mov.f32 %f46, %f43;
64
+ $L__BB0_1:
65
+ .loc 1 33 34
66
+ mul.wide.s32 %rd6, %r96, 4;
67
+ add.s64 %rd4, %rd1, %rd6;
68
+ mov.b32 %r29, 0;
69
+ .loc 1 33 63
70
+ mov.u32 %r25, 0x0;
71
+ mov.u32 %r26, 0x0;
72
+ mov.u32 %r27, 0x0;
73
+ mov.u32 %r28, 0x0;
74
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd4 + 0 ];
75
+ @!%p1 mov.u32 %r25, %r29;
76
+ @!%p1 mov.u32 %r26, %r29;
77
+ @!%p1 mov.u32 %r27, %r29;
78
+ @!%p1 mov.u32 %r28, %r29;
79
+ .loc 1 34 34
80
+ add.s64 %rd5, %rd2, %rd6;
81
+ .loc 1 34 63
82
+ mov.u32 %r33, 0x0;
83
+ mov.u32 %r34, 0x0;
84
+ mov.u32 %r35, 0x0;
85
+ mov.u32 %r36, 0x0;
86
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
87
+ @!%p1 mov.u32 %r33, %r29;
88
+ @!%p1 mov.u32 %r34, %r29;
89
+ @!%p1 mov.u32 %r35, %r29;
90
+ @!%p1 mov.u32 %r36, %r29;
91
+ .loc 1 33 63
92
+ mov.b32 %f13, %r25;
93
+ mov.b32 %f14, %r26;
94
+ mov.b32 %f15, %r27;
95
+ mov.b32 %f16, %r28;
96
+ .loc 1 34 63
97
+ mov.b32 %f17, %r33;
98
+ mov.b32 %f18, %r34;
99
+ mov.b32 %f19, %r35;
100
+ mov.b32 %f20, %r36;
101
+ .loc 1 38 38
102
+ fma.rn.f32 %f46, %f16, %f20, %f46;
103
+ fma.rn.f32 %f45, %f15, %f19, %f45;
104
+ fma.rn.f32 %f44, %f14, %f18, %f44;
105
+ fma.rn.f32 %f43, %f13, %f17, %f43;
106
+ .loc 1 29 36
107
+ add.s32 %r97, %r97, 8;
108
+ add.s32 %r96, %r96, 2048;
109
+ setp.lt.u32 %p11, %r97, 120;
110
+ @%p11 bra $L__BB0_1;
111
+ .loc 1 22 44
112
+ and.b32 %r58, %r1, 63;
113
+ .loc 1 22 23
114
+ or.b32 %r59, %r5, %r58;
115
+ $L__tmp1:
116
+ .loc 2 243 36
117
+ mov.b32 %r60, %f43;
118
+ shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
119
+ mov.b32 %f21, %r61;
120
+ $L__tmp2:
121
+ .loc 2 233 15
122
+ add.f32 %f22, %f43, %f21;
123
+ $L__tmp3:
124
+ .loc 2 243 36
125
+ mov.b32 %r62, %f44;
126
+ shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
127
+ mov.b32 %f23, %r63;
128
+ $L__tmp4:
129
+ .loc 2 233 15
130
+ add.f32 %f24, %f44, %f23;
131
+ $L__tmp5:
132
+ .loc 2 243 36
133
+ mov.b32 %r64, %f45;
134
+ shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
135
+ mov.b32 %f25, %r65;
136
+ $L__tmp6:
137
+ .loc 2 233 15
138
+ add.f32 %f26, %f45, %f25;
139
+ $L__tmp7:
140
+ .loc 2 243 36
141
+ mov.b32 %r66, %f46;
142
+ shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
143
+ mov.b32 %f27, %r67;
144
+ $L__tmp8:
145
+ .loc 2 233 15
146
+ add.f32 %f28, %f46, %f27;
147
+ $L__tmp9:
148
+ .loc 2 243 36
149
+ setp.lt.u32 %p12, %r2, 16;
150
+ shl.b32 %r68, %r3, 2;
151
+ or.b32 %r69, %r68, %r4;
152
+ shl.b32 %r70, %r69, 2;
153
+ mov.u32 %r71, global_smem;
154
+ add.s32 %r41, %r71, %r70;
155
+ mov.b32 %r42, %f22;
156
+ @%p12 st.shared.b32 [ %r41 + 0 ], %r42;
157
+ shl.b32 %r72, %r4, 2;
158
+ shl.b32 %r73, %r3, 4;
159
+ or.b32 %r74, %r73, 16;
160
+ or.b32 %r75, %r74, %r72;
161
+ add.s32 %r43, %r71, %r75;
162
+ mov.b32 %r44, %f24;
163
+ @%p12 st.shared.b32 [ %r43 + 0 ], %r44;
164
+ or.b32 %r76, %r73, 32;
165
+ or.b32 %r77, %r76, %r72;
166
+ add.s32 %r45, %r71, %r77;
167
+ mov.b32 %r46, %f26;
168
+ @%p12 st.shared.b32 [ %r45 + 0 ], %r46;
169
+ or.b32 %r78, %r73, 48;
170
+ or.b32 %r79, %r78, %r72;
171
+ add.s32 %r47, %r71, %r79;
172
+ mov.b32 %r48, %f28;
173
+ @%p12 st.shared.b32 [ %r47 + 0 ], %r48;
174
+ bar.sync 0;
175
+ setp.lt.s32 %p16, %r1, 256;
176
+ add.s32 %r50, %r71, %r13;
177
+ @%p16 ld.shared.b32 %r49, [ %r50 + 0 ];
178
+ mov.b32 %f29, %r49;
179
+ shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
180
+ mov.b32 %f30, %r81;
181
+ $L__tmp10:
182
+ .loc 2 233 15
183
+ add.f32 %f31, %f29, %f30;
184
+ $L__tmp11:
185
+ .loc 2 243 36
186
+ mov.b32 %r82, %f31;
187
+ shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
188
+ mov.b32 %f32, %r83;
189
+ $L__tmp12:
190
+ .loc 2 233 15
191
+ add.f32 %f33, %f31, %f32;
192
+ $L__tmp13:
193
+ .loc 2 243 36
194
+ and.b32 %r84, %r1, 3;
195
+ setp.eq.s32 %p21, %r84, 0;
196
+ and.pred %p17, %p16, %p21;
197
+ mov.b32 %r52, %f33;
198
+ @%p17 st.shared.b32 [ %r50 + 0 ], %r52;
199
+ add.s32 %r54, %r50, 512;
200
+ @%p16 ld.shared.b32 %r53, [ %r54 + 0 ];
201
+ mov.b32 %f34, %r53;
202
+ shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
203
+ mov.b32 %f35, %r85;
204
+ $L__tmp14:
205
+ .loc 2 233 15
206
+ add.f32 %f36, %f34, %f35;
207
+ $L__tmp15:
208
+ .loc 2 243 36
209
+ mov.b32 %r86, %f36;
210
+ shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
211
+ mov.b32 %f37, %r87;
212
+ $L__tmp16:
213
+ .loc 2 233 15
214
+ add.f32 %f38, %f36, %f37;
215
+ $L__tmp17:
216
+ .loc 2 243 36
217
+ mov.b32 %r56, %f38;
218
+ @%p17 st.shared.b32 [ %r54 + 0 ], %r56;
219
+ bar.sync 0;
220
+ add.s32 %r88, %r71, %r73;
221
+ ld.shared.f32 %f39, [%r88];
222
+ add.s32 %r89, %r71, %r74;
223
+ ld.shared.f32 %f40, [%r89];
224
+ add.s32 %r90, %r71, %r76;
225
+ ld.shared.f32 %f41, [%r90];
226
+ add.s32 %r91, %r71, %r78;
227
+ ld.shared.f32 %f42, [%r91];
228
+ $L__tmp18:
229
+ .loc 1 39 28
230
+ bar.sync 0;
231
+ add.s32 %r92, %r71, %r68;
232
+ st.shared.f32 [%r92], %f39;
233
+ st.shared.f32 [%r92+4], %f40;
234
+ st.shared.f32 [%r92+8], %f41;
235
+ st.shared.f32 [%r92+12], %f42;
236
+ bar.sync 0;
237
+ shl.b32 %r93, %r58, 2;
238
+ add.s32 %r94, %r71, %r93;
239
+ ld.shared.u32 %r57, [%r94];
240
+ .loc 1 40 25
241
+ mul.wide.s32 %rd8, %r59, 4;
242
+ add.s64 %rd7, %rd3, %rd8;
243
+ .loc 1 40 36
244
+ and.b32 %r95, %r1, 64;
245
+ setp.eq.s32 %p20, %r95, 0;
246
+ @%p20 st.global.b32 [ %rd7 + 0 ], { %r57 };
247
+ .loc 1 40 4
248
+ ret;
249
+ $L__tmp19:
250
+ $L__func_end0:
251
+
252
+ }
253
+ .file 1 "/tmp/torchinductor_root/qd/cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py"
254
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
255
+ .section .debug_abbrev
256
+ {
257
+ .b8 1
258
+ .b8 17
259
+ .b8 1
260
+ .b8 37
261
+ .b8 8
262
+ .b8 19
263
+ .b8 5
264
+ .b8 3
265
+ .b8 8
266
+ .b8 16
267
+ .b8 6
268
+ .b8 27
269
+ .b8 8
270
+ .b8 180
271
+ .b8 66
272
+ .b8 12
273
+ .b8 17
274
+ .b8 1
275
+ .b8 18
276
+ .b8 1
277
+ .b8 0
278
+ .b8 0
279
+ .b8 2
280
+ .b8 46
281
+ .b8 0
282
+ .b8 135
283
+ .b8 64
284
+ .b8 8
285
+ .b8 3
286
+ .b8 8
287
+ .b8 58
288
+ .b8 11
289
+ .b8 59
290
+ .b8 11
291
+ .b8 63
292
+ .b8 12
293
+ .b8 32
294
+ .b8 11
295
+ .b8 0
296
+ .b8 0
297
+ .b8 3
298
+ .b8 46
299
+ .b8 1
300
+ .b8 17
301
+ .b8 1
302
+ .b8 18
303
+ .b8 1
304
+ .b8 64
305
+ .b8 10
306
+ .b8 49
307
+ .b8 19
308
+ .b8 0
309
+ .b8 0
310
+ .b8 4
311
+ .b8 29
312
+ .b8 0
313
+ .b8 49
314
+ .b8 19
315
+ .b8 17
316
+ .b8 1
317
+ .b8 18
318
+ .b8 1
319
+ .b8 88
320
+ .b8 11
321
+ .b8 89
322
+ .b8 11
323
+ .b8 87
324
+ .b8 11
325
+ .b8 0
326
+ .b8 0
327
+ .b8 5
328
+ .b8 29
329
+ .b8 1
330
+ .b8 49
331
+ .b8 19
332
+ .b8 17
333
+ .b8 1
334
+ .b8 18
335
+ .b8 1
336
+ .b8 88
337
+ .b8 11
338
+ .b8 89
339
+ .b8 11
340
+ .b8 87
341
+ .b8 11
342
+ .b8 0
343
+ .b8 0
344
+ .b8 0
345
+ }
346
+ .section .debug_info
347
+ {
348
+ .b32 266
349
+ .b8 2
350
+ .b8 0
351
+ .b32 .debug_abbrev
352
+ .b8 8
353
+ .b8 1
354
+ .b8 116
355
+ .b8 114
356
+ .b8 105
357
+ .b8 116
358
+ .b8 111
359
+ .b8 110
360
+ .b8 0
361
+ .b8 2
362
+ .b8 0
363
+ .b8 99
364
+ .b8 113
365
+ .b8 100
366
+ .b8 118
367
+ .b8 108
368
+ .b8 116
369
+ .b8 110
370
+ .b8 100
371
+ .b8 120
372
+ .b8 99
373
+ .b8 55
374
+ .b8 118
375
+ .b8 119
376
+ .b8 106
377
+ .b8 53
378
+ .b8 106
379
+ .b8 53
380
+ .b8 100
381
+ .b8 110
382
+ .b8 115
383
+ .b8 98
384
+ .b8 55
385
+ .b8 51
386
+ .b8 116
387
+ .b8 107
388
+ .b8 55
389
+ .b8 54
390
+ .b8 51
391
+ .b8 103
392
+ .b8 97
393
+ .b8 106
394
+ .b8 102
395
+ .b8 116
396
+ .b8 106
397
+ .b8 119
398
+ .b8 118
399
+ .b8 109
400
+ .b8 98
401
+ .b8 102
402
+ .b8 113
403
+ .b8 55
404
+ .b8 105
405
+ .b8 54
406
+ .b8 115
407
+ .b8 105
408
+ .b8 116
409
+ .b8 107
410
+ .b8 53
411
+ .b8 103
412
+ .b8 119
413
+ .b8 111
414
+ .b8 121
415
+ .b8 46
416
+ .b8 112
417
+ .b8 121
418
+ .b8 0
419
+ .b32 .debug_line
420
+ .b8 47
421
+ .b8 116
422
+ .b8 109
423
+ .b8 112
424
+ .b8 47
425
+ .b8 116
426
+ .b8 111
427
+ .b8 114
428
+ .b8 99
429
+ .b8 104
430
+ .b8 105
431
+ .b8 110
432
+ .b8 100
433
+ .b8 117
434
+ .b8 99
435
+ .b8 116
436
+ .b8 111
437
+ .b8 114
438
+ .b8 95
439
+ .b8 114
440
+ .b8 111
441
+ .b8 111
442
+ .b8 116
443
+ .b8 47
444
+ .b8 113
445
+ .b8 100
446
+ .b8 0
447
+ .b8 1
448
+ .b64 $L__func_begin0
449
+ .b64 $L__func_end0
450
+ .b8 2
451
+ .b8 116
452
+ .b8 114
453
+ .b8 105
454
+ .b8 116
455
+ .b8 111
456
+ .b8 110
457
+ .b8 95
458
+ .b8 95
459
+ .b8 48
460
+ .b8 100
461
+ .b8 49
462
+ .b8 100
463
+ .b8 50
464
+ .b8 100
465
+ .b8 51
466
+ .b8 100
467
+ .b8 101
468
+ .b8 52
469
+ .b8 100
470
+ .b8 101
471
+ .b8 0
472
+ .b8 116
473
+ .b8 114
474
+ .b8 105
475
+ .b8 116
476
+ .b8 111
477
+ .b8 110
478
+ .b8 95
479
+ .b8 95
480
+ .b8 48
481
+ .b8 100
482
+ .b8 49
483
+ .b8 100
484
+ .b8 50
485
+ .b8 100
486
+ .b8 51
487
+ .b8 100
488
+ .b8 101
489
+ .b8 52
490
+ .b8 100
491
+ .b8 101
492
+ .b8 0
493
+ .b8 1
494
+ .b8 18
495
+ .b8 1
496
+ .b8 1
497
+ .b8 3
498
+ .b64 $L__func_begin0
499
+ .b64 $L__func_end0
500
+ .b8 1
501
+ .b8 156
502
+ .b32 125
503
+ .b8 4
504
+ .b32 125
505
+ .b64 $L__tmp1
506
+ .b64 $L__tmp18
507
+ .b8 2
508
+ .b8 39
509
+ .b8 25
510
+ .b8 5
511
+ .b32 125
512
+ .b64 $L__tmp2
513
+ .b64 $L__tmp17
514
+ .b8 2
515
+ .b8 39
516
+ .b8 25
517
+ .b8 4
518
+ .b32 125
519
+ .b64 $L__tmp2
520
+ .b64 $L__tmp17
521
+ .b8 2
522
+ .b8 243
523
+ .b8 36
524
+ .b8 0
525
+ .b8 0
526
+ .b8 0
527
+ }
528
+ .section .debug_pubnames
529
+ {
530
+ .b32 $L__pubNames_end0-$L__pubNames_start0
531
+ $L__pubNames_start0:
532
+ .b8 2
533
+ .b8 0
534
+ .b32 .debug_info
535
+ .b32 270
536
+ .b32 125
537
+ .b8 116
538
+ .b8 114
539
+ .b8 105
540
+ .b8 116
541
+ .b8 111
542
+ .b8 110
543
+ .b8 95
544
+ .b8 95
545
+ .b8 48
546
+ .b8 100
547
+ .b8 49
548
+ .b8 100
549
+ .b8 50
550
+ .b8 100
551
+ .b8 51
552
+ .b8 100
553
+ .b8 101
554
+ .b8 52
555
+ .b8 100
556
+ .b8 101
557
+ .b8 0
558
+ .b32 0
559
+ $L__pubNames_end0:
560
+ }
561
+ .section .debug_pubtypes
562
+ {
563
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
564
+ $L__pubTypes_start0:
565
+ .b8 2
566
+ .b8 0
567
+ .b32 .debug_info
568
+ .b32 270
569
+ .b32 0
570
+ $L__pubTypes_end0:
571
+ }
572
+ .section .debug_loc { }
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin ADDED
Binary file (16.5 kB). View file
 
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = shl i32 %6, 2, !dbg !8
11
+ %10 = and i32 %9, 60, !dbg !8
12
+ %11 = and i32 %8, 3, !dbg !9
13
+ %12 = lshr i32 %7, 4, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 1, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
17
+ %16 = shl i32 %15, 6, !dbg !11
18
+ %17 = or i32 %16, %10, !dbg !12
19
+ %.frozen = freeze i32 %17
20
+ %18 = sdiv i32 %.frozen, 256, !dbg !13
21
+ %19 = mul i32 %18, 256
22
+ %.decomposed = sub i32 %.frozen, %19
23
+ %20 = shl i32 %18, 15, !dbg !14
24
+ %21 = add i32 %20, %.decomposed
25
+ br label %22, !dbg !15
26
+
27
+ 22: ; preds = %5, %22
28
+ %23 = phi i32 [ 0, %5 ], [ %58, %22 ]
29
+ %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ]
30
+ %25 = or i32 %23, %14, !dbg !16
31
+ %26 = shl i32 %25, 8, !dbg !17
32
+ %27 = add i32 %21, %26, !dbg !18
33
+ %28 = sext i32 %27 to i64, !dbg !19
34
+ %29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19
35
+ %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
36
+ %31 = extractvalue { i32, i32 } %30, 0, !dbg !20
37
+ %32 = extractvalue { i32, i32 } %30, 1, !dbg !20
38
+ %33 = trunc i32 %31 to i16, !dbg !20
39
+ %extelt.offset = lshr i32 %31, 16, !dbg !20
40
+ %34 = trunc i32 %extelt.offset to i16, !dbg !20
41
+ %35 = trunc i32 %32 to i16, !dbg !20
42
+ %extelt.offset1 = lshr i32 %32, 16, !dbg !20
43
+ %36 = trunc i32 %extelt.offset1 to i16, !dbg !20
44
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21
45
+ %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21
46
+ %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21
47
+ %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21
48
+ %41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22
49
+ %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23
50
+ %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23
51
+ %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23
52
+ %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23
53
+ %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23
54
+ %47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23
55
+ %48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23
56
+ %49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23
57
+ %50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23
58
+ %51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23
59
+ %52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24
60
+ %53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24
61
+ %54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24
62
+ %55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24
63
+ %56 = fmul <4 x float> %55, %51, !dbg !24
64
+ %57 = fadd <4 x float> %24, %56, !dbg !25
65
+ %58 = add nuw nsw i32 %23, 8, !dbg !15
66
+ %59 = icmp ult i32 %23, 120, !dbg !15
67
+ br i1 %59, label %22, label %60, !dbg !15
68
+
69
+ 60: ; preds = %22
70
+ %61 = and i32 %6, 63, !dbg !8
71
+ %62 = or i32 %16, %61, !dbg !12
72
+ %63 = or i32 %10, 3, !dbg !26
73
+ %64 = or i32 %10, 2, !dbg !26
74
+ %65 = or i32 %10, 1, !dbg !26
75
+ %66 = extractelement <4 x float> %57, i64 0, !dbg !26
76
+ %67 = bitcast float %66 to i32, !dbg !26
77
+ %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26
78
+ %69 = bitcast i32 %68 to float, !dbg !26
79
+ %70 = fadd float %66, %69, !dbg !30
80
+ %71 = extractelement <4 x float> %57, i64 1, !dbg !26
81
+ %72 = bitcast float %71 to i32, !dbg !26
82
+ %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26
83
+ %74 = bitcast i32 %73 to float, !dbg !26
84
+ %75 = fadd float %71, %74, !dbg !30
85
+ %76 = extractelement <4 x float> %57, i64 2, !dbg !26
86
+ %77 = bitcast float %76 to i32, !dbg !26
87
+ %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26
88
+ %79 = bitcast i32 %78 to float, !dbg !26
89
+ %80 = fadd float %76, %79, !dbg !30
90
+ %81 = extractelement <4 x float> %57, i64 3, !dbg !26
91
+ %82 = bitcast float %81 to i32, !dbg !26
92
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26
93
+ %84 = bitcast i32 %83 to float, !dbg !26
94
+ %85 = fadd float %81, %84, !dbg !30
95
+ %86 = icmp ult i32 %7, 16, !dbg !26
96
+ %87 = shl nuw nsw i32 %10, 2, !dbg !26
97
+ %88 = or i32 %87, %11, !dbg !26
98
+ %89 = zext nneg i32 %88 to i64, !dbg !26
99
+ %90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26
100
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26
101
+ %91 = shl nuw nsw i32 %65, 2, !dbg !26
102
+ %92 = or i32 %91, %11, !dbg !26
103
+ %93 = zext nneg i32 %92 to i64, !dbg !26
104
+ %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26
105
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26
106
+ %95 = shl nuw nsw i32 %64, 2, !dbg !26
107
+ %96 = or i32 %95, %11, !dbg !26
108
+ %97 = zext nneg i32 %96 to i64, !dbg !26
109
+ %98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26
110
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26
111
+ %99 = shl nuw nsw i32 %63, 2, !dbg !26
112
+ %100 = or i32 %99, %11, !dbg !26
113
+ %101 = zext nneg i32 %100 to i64, !dbg !26
114
+ %102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26
115
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26
116
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
117
+ %103 = icmp slt i32 %6, 256, !dbg !26
118
+ %104 = sext i32 %6 to i64, !dbg !26
119
+ %105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26
120
+ %106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26
121
+ %107 = bitcast float %106 to i32, !dbg !26
122
+ %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26
123
+ %109 = bitcast i32 %108 to float, !dbg !26
124
+ %110 = fadd float %106, %109, !dbg !30
125
+ %111 = bitcast float %110 to i32, !dbg !26
126
+ %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26
127
+ %113 = bitcast i32 %112 to float, !dbg !26
128
+ %114 = fadd float %110, %113, !dbg !30
129
+ %115 = and i32 %6, 3, !dbg !26
130
+ %116 = icmp eq i32 %115, 0, !dbg !26
131
+ %117 = and i1 %103, %116, !dbg !26
132
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26
133
+ %118 = add i32 %6, 128, !dbg !26
134
+ %119 = sext i32 %118 to i64, !dbg !26
135
+ %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26
136
+ %121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26
137
+ %122 = bitcast float %121 to i32, !dbg !26
138
+ %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26
139
+ %124 = bitcast i32 %123 to float, !dbg !26
140
+ %125 = fadd float %121, %124, !dbg !30
141
+ %126 = bitcast float %125 to i32, !dbg !26
142
+ %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26
143
+ %128 = bitcast i32 %127 to float, !dbg !26
144
+ %129 = fadd float %125, %128, !dbg !30
145
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26
146
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
147
+ %130 = zext nneg i32 %87 to i64, !dbg !26
148
+ %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26
149
+ %132 = load float, ptr addrspace(3) %131, align 4, !dbg !26
150
+ %133 = zext nneg i32 %91 to i64, !dbg !26
151
+ %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26
152
+ %135 = load float, ptr addrspace(3) %134, align 4, !dbg !26
153
+ %136 = zext nneg i32 %95 to i64, !dbg !26
154
+ %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26
155
+ %138 = load float, ptr addrspace(3) %137, align 4, !dbg !26
156
+ %139 = zext nneg i32 %99 to i64, !dbg !26
157
+ %140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26
158
+ %141 = load float, ptr addrspace(3) %140, align 4, !dbg !26
159
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
160
+ %142 = zext nneg i32 %10 to i64, !dbg !34
161
+ %143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34
162
+ %144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34
163
+ store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34
164
+ %145 = zext nneg i32 %65 to i64, !dbg !34
165
+ %146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34
166
+ %147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34
167
+ store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34
168
+ %148 = zext nneg i32 %64 to i64, !dbg !34
169
+ %149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34
170
+ %150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34
171
+ store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34
172
+ %151 = zext nneg i32 %63 to i64, !dbg !34
173
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34
174
+ %153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34
175
+ store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34
176
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
177
+ %154 = zext nneg i32 %61 to i64, !dbg !34
178
+ %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34
179
+ %156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34
180
+ %157 = sext i32 %62 to i64, !dbg !35
181
+ %158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35
182
+ %159 = and i32 %6, 64, !dbg !36
183
+ %160 = icmp eq i32 %159, 0, !dbg !36
184
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36
185
+ ret void, !dbg !37
186
+ }
187
+
188
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
189
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
190
+
191
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
192
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
193
+
194
+ ; Function Attrs: convergent nocallback nounwind
195
+ declare void @llvm.nvvm.barrier0() #2
196
+
197
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
198
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
199
+ attributes #2 = { convergent nocallback nounwind }
200
+ attributes #3 = { nounwind }
201
+
202
+ !llvm.module.flags = !{!0}
203
+ !llvm.dbg.cu = !{!1}
204
+ !nvvm.annotations = !{!3, !4, !4, !3}
205
+
206
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
207
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
208
+ !2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj")
209
+ !3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
210
+ !4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
211
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
212
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
213
+ !7 = !{}
214
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
215
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
216
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
217
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
218
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
219
+ !13 = !DILocation(line: 26, column: 20, scope: !5)
220
+ !14 = !DILocation(line: 33, column: 57, scope: !5)
221
+ !15 = !DILocation(line: 29, column: 36, scope: !5)
222
+ !16 = !DILocation(line: 30, column: 27, scope: !5)
223
+ !17 = !DILocation(line: 33, column: 44, scope: !5)
224
+ !18 = !DILocation(line: 33, column: 51, scope: !5)
225
+ !19 = !DILocation(line: 33, column: 34, scope: !5)
226
+ !20 = !DILocation(line: 33, column: 63, scope: !5)
227
+ !21 = !DILocation(line: 33, column: 115, scope: !5)
228
+ !22 = !DILocation(line: 34, column: 34, scope: !5)
229
+ !23 = !DILocation(line: 34, column: 63, scope: !5)
230
+ !24 = !DILocation(line: 36, column: 22, scope: !5)
231
+ !25 = !DILocation(line: 39, column: 38, scope: !5)
232
+ !26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29)
233
+ !27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
234
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
235
+ !29 = !DILocation(line: 40, column: 25, scope: !27)
236
+ !30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32)
237
+ !31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0)
238
+ !32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33)
239
+ !33 = !DILocation(line: 40, column: 25, scope: !31)
240
+ !34 = !DILocation(line: 40, column: 28, scope: !5)
241
+ !35 = !DILocation(line: 41, column: 25, scope: !5)
242
+ !36 = !DILocation(line: 41, column: 36, scope: !5)
243
+ !37 = !DILocation(line: 41, column: 4, scope: !5)
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %c8_i32 = arith.constant 8 : i32
5
+ %c128_i32 = arith.constant 128 : i32
6
+ %c0_i32 = arith.constant 0 : i32
7
+ %cst_0 = arith.constant dense<32768> : tensor<64x1xi32>
8
+ %cst_1 = arith.constant dense<256> : tensor<1x8xi32>
9
+ %cst_2 = arith.constant dense<128> : tensor<1x8xi32>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
11
+ %cst_4 = arith.constant dense<256> : tensor<64x1xi32>
12
+ %c64_i32 = arith.constant 64 : i32
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.muli %0, %c64_i32 : i32
15
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
16
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
17
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
18
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
19
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
20
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
21
+ %8 = arith.remsi %5, %cst_4 : tensor<64x1xi32>
22
+ %9 = arith.divsi %5, %cst_4 : tensor<64x1xi32>
23
+ %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
24
+ %11 = arith.muli %9, %cst_0 : tensor<64x1xi32>
25
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
26
+ %13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
27
+ %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
28
+ %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32>) : i32 {
29
+ %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
30
+ %21 = arith.addi %20, %7 : tensor<1x8xi32>
31
+ %22 = arith.cmpi slt, %21, %cst_2 : tensor<1x8xi32>
32
+ %23 = arith.muli %21, %cst_1 : tensor<1x8xi32>
33
+ %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
34
+ %25 = arith.addi %10, %24 : tensor<64x8xi32>
35
+ %26 = arith.addi %25, %12 : tensor<64x8xi32>
36
+ %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
37
+ %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
38
+ %29 = tt.load %27, %28, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
39
+ %30 = arith.extf %29 : tensor<64x8xbf16> to tensor<64x8xf32>
40
+ %31 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
41
+ %32 = tt.load %31, %28, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
42
+ %33 = arith.mulf %30, %32 : tensor<64x8xf32>
43
+ %34 = arith.addf %arg6, %33 : tensor<64x8xf32>
44
+ %35 = arith.select %28, %34, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
45
+ scf.yield %35 : tensor<64x8xf32>
46
+ }
47
+ %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
48
+ ^bb0(%arg5: f32, %arg6: f32):
49
+ %20 = arith.addf %arg5, %arg6 : f32
50
+ tt.reduce.return %20 : f32
51
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
52
+ %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
53
+ %18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
54
+ %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
55
+ tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
56
+ tt.return
57
+ }
58
+ }
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttgir ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<1024xi32, #blocked>
5
+ %cst_0 = arith.constant dense<3> : tensor<1024xi32, #blocked>
6
+ %cst_1 = arith.constant dense<768> : tensor<1024xi32, #blocked>
7
+ %cst_2 = arith.constant dense<2> : tensor<1024xi32, #blocked>
8
+ %cst_3 = arith.constant dense<0> : tensor<1024xi32, #blocked>
9
+ %cst_4 = arith.constant dense<1> : tensor<1024xi32, #blocked>
10
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
11
+ %c1024_i32 = arith.constant 1024 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c1024_i32 : i32
14
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
15
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
16
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
17
+ %5 = arith.divsi %4, %cst : tensor<1024xi32, #blocked>
18
+ %6 = arith.remsi %5, %cst_0 : tensor<1024xi32, #blocked>
19
+ %7 = arith.remsi %4, %cst : tensor<1024xi32, #blocked>
20
+ %8 = arith.divsi %4, %cst_1 : tensor<1024xi32, #blocked>
21
+ %9 = arith.muli %8, %cst : tensor<1024xi32, #blocked>
22
+ %10 = arith.addi %7, %9 : tensor<1024xi32, #blocked>
23
+ %11 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
24
+ %12 = tt.addptr %11, %10 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
25
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
26
+ %14 = arith.extf %13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
27
+ %15 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
28
+ %16 = tt.addptr %15, %10 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
29
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
30
+ %18 = arith.extf %17 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
31
+ %19 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
32
+ %20 = tt.addptr %19, %10 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
33
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
34
+ %22 = arith.extf %21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
35
+ %23 = arith.cmpi eq, %6, %cst_2 : tensor<1024xi32, #blocked>
36
+ %24 = arith.select %23, %14, %cst_5 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
37
+ %25 = arith.cmpi eq, %6, %cst_4 : tensor<1024xi32, #blocked>
38
+ %26 = arith.select %25, %18, %cst_5 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
39
+ %27 = arith.addf %24, %26 : tensor<1024xf32, #blocked>
40
+ %28 = arith.cmpi eq, %6, %cst_3 : tensor<1024xi32, #blocked>
41
+ %29 = arith.select %28, %22, %cst_5 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
42
+ %30 = arith.addf %27, %29 : tensor<1024xf32, #blocked>
43
+ %31 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
44
+ %32 = tt.addptr %31, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
45
+ %33 = arith.truncf %30 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
46
+ tt.store %32, %33 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
47
+ tt.return
48
+ }
49
+ }
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ttir ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0> : tensor<1024xi32>
4
+ %cst_0 = arith.constant dense<1> : tensor<1024xi32>
5
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32>
6
+ %cst_2 = arith.constant dense<2> : tensor<1024xi32>
7
+ %cst_3 = arith.constant dense<768> : tensor<1024xi32>
8
+ %cst_4 = arith.constant dense<3> : tensor<1024xi32>
9
+ %cst_5 = arith.constant dense<256> : tensor<1024xi32>
10
+ %c1024_i32 = arith.constant 1024 : i32
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = arith.muli %0, %c1024_i32 : i32
13
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
14
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
15
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
16
+ %5 = arith.divsi %4, %cst_5 : tensor<1024xi32>
17
+ %6 = arith.remsi %5, %cst_4 : tensor<1024xi32>
18
+ %7 = arith.remsi %4, %cst_5 : tensor<1024xi32>
19
+ %8 = arith.divsi %4, %cst_3 : tensor<1024xi32>
20
+ %9 = arith.muli %8, %cst_5 : tensor<1024xi32>
21
+ %10 = arith.addi %7, %9 : tensor<1024xi32>
22
+ %11 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
23
+ %12 = tt.addptr %11, %10 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
24
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16>
25
+ %14 = arith.extf %13 : tensor<1024xbf16> to tensor<1024xf32>
26
+ %15 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
27
+ %16 = tt.addptr %15, %10 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
28
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16>
29
+ %18 = arith.extf %17 : tensor<1024xbf16> to tensor<1024xf32>
30
+ %19 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
31
+ %20 = tt.addptr %19, %10 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
32
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1024xbf16>
33
+ %22 = arith.extf %21 : tensor<1024xbf16> to tensor<1024xf32>
34
+ %23 = arith.cmpi eq, %6, %cst_2 : tensor<1024xi32>
35
+ %24 = arith.select %23, %14, %cst_1 : tensor<1024xi1>, tensor<1024xf32>
36
+ %25 = arith.cmpi eq, %6, %cst_0 : tensor<1024xi32>
37
+ %26 = arith.select %25, %18, %cst_1 : tensor<1024xi1>, tensor<1024xf32>
38
+ %27 = arith.addf %24, %26 : tensor<1024xf32>
39
+ %28 = arith.cmpi eq, %6, %cst : tensor<1024xi32>
40
+ %29 = arith.select %28, %22, %cst_1 : tensor<1024xi1>, tensor<1024xf32>
41
+ %30 = arith.addf %27, %29 : tensor<1024xf32>
42
+ %31 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
43
+ %32 = tt.addptr %31, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
44
+ %33 = arith.truncf %30 : tensor<1024xf32> to tensor<1024xbf16>
45
+ tt.store %32, %33 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
46
+ tt.return
47
+ }
48
+ }
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %5 = shl i32 %4, 1, !dbg !10
9
+ %6 = and i32 %5, 510, !dbg !10
10
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %8 = shl i32 %7, 9, !dbg !12
12
+ %9 = or i32 %8, %6, !dbg !13
13
+ %10 = sext i32 %9 to i64, !dbg !14
14
+ %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
15
+ %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
16
+ %13 = trunc i32 %12 to i16, !dbg !15
17
+ %extelt.offset = lshr i32 %12, 16, !dbg !15
18
+ %14 = trunc i32 %extelt.offset to i16, !dbg !15
19
+ %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
20
+ %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #4, !dbg !16
21
+ %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
22
+ %18 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %17, i1 true) #4, !dbg !18
23
+ %19 = trunc i32 %18 to i16, !dbg !18
24
+ %extelt.offset1 = lshr i32 %18, 16, !dbg !18
25
+ %20 = trunc i32 %extelt.offset1 to i16, !dbg !18
26
+ %21 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !19
27
+ %22 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !19
28
+ %23 = fmul float %21, 0x3FE6A09E60000000, !dbg !20
29
+ %24 = fmul float %22, 0x3FE6A09E60000000, !dbg !20
30
+ %25 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
31
+ %.not.i = icmp eq i32 %25, 0, !dbg !21
32
+ %26 = tail call float @llvm.nvvm.fabs.ftz.f(float %23) #4, !dbg !21
33
+ %27 = tail call float @llvm.nvvm.fabs.f(float %23) #4, !dbg !21
34
+ %.0.i = select i1 %.not.i, float %27, float %26, !dbg !21
35
+ %28 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
36
+ br i1 %28, label %__nv_fabsf.exit1.i, label %30, !dbg !21
37
+
38
+ __nv_fabsf.exit1.i: ; preds = %3
39
+ %29 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
40
+ %.not1.i = icmp eq i32 %29, 0, !dbg !21
41
+ %.01.i = select i1 %.not1.i, float %27, float %26, !dbg !21
42
+ br label %__internal_fmad.exit.i, !dbg !21
43
+
44
+ 30: ; preds = %3
45
+ %31 = fmul float %23, %23, !dbg !21
46
+ br label %__internal_fmad.exit.i, !dbg !21
47
+
48
+ __internal_fmad.exit.i: ; preds = %30, %__nv_fabsf.exit1.i
49
+ %32 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %30 ], !dbg !21
50
+ %33 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %30 ], !dbg !21
51
+ %34 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %30 ], !dbg !21
52
+ %35 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %30 ], !dbg !21
53
+ %36 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %30 ], !dbg !21
54
+ %37 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %30 ], !dbg !21
55
+ %38 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %30 ], !dbg !21
56
+ %39 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %31, %30 ], !dbg !21
57
+ %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
58
+ %.not2.i = icmp eq i32 %40, 0, !dbg !21
59
+ %41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %38, float %39, float %37) #4, !dbg !21
60
+ %42 = tail call float @llvm.nvvm.fma.rn.f(float %38, float %39, float %37) #4, !dbg !21
61
+ %.02.i = select i1 %.not2.i, float %42, float %41, !dbg !21
62
+ %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
63
+ %.not3.i = icmp eq i32 %43, 0, !dbg !21
64
+ %44 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %39, float %36) #4, !dbg !21
65
+ %45 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %39, float %36) #4, !dbg !21
66
+ %.03.i = select i1 %.not3.i, float %45, float %44, !dbg !21
67
+ %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
68
+ %.not4.i = icmp eq i32 %46, 0, !dbg !21
69
+ %47 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %39, float %35) #4, !dbg !21
70
+ %48 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %39, float %35) #4, !dbg !21
71
+ %.04.i = select i1 %.not4.i, float %48, float %47, !dbg !21
72
+ %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
73
+ %.not5.i = icmp eq i32 %49, 0, !dbg !21
74
+ %50 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %39, float %34) #4, !dbg !21
75
+ %51 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %39, float %34) #4, !dbg !21
76
+ %.05.i = select i1 %.not5.i, float %51, float %50, !dbg !21
77
+ %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
78
+ %.not6.i = icmp eq i32 %52, 0, !dbg !21
79
+ %53 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %39, float %33) #4, !dbg !21
80
+ %54 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %39, float %33) #4, !dbg !21
81
+ %.06.i = select i1 %.not6.i, float %54, float %53, !dbg !21
82
+ %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
83
+ %.not7.i = icmp eq i32 %55, 0, !dbg !21
84
+ %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %39, float %32) #4, !dbg !21
85
+ %57 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %39, float %32) #4, !dbg !21
86
+ %.07.i = select i1 %.not7.i, float %57, float %56, !dbg !21
87
+ %58 = fneg float %39, !dbg !21
88
+ %59 = select i1 %28, float %58, float %23, !dbg !21
89
+ %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
90
+ %.not8.i = icmp eq i32 %60, 0, !dbg !21
91
+ %61 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %59, float %59) #4, !dbg !21
92
+ %62 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %59, float %59) #4, !dbg !21
93
+ %.08.i = select i1 %.not8.i, float %62, float %61, !dbg !21
94
+ br i1 %28, label %63, label %__nv_erff.exit, !dbg !21
95
+
96
+ 63: ; preds = %__internal_fmad.exit.i
97
+ %64 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
98
+ %65 = fsub float 1.000000e+00, %64, !dbg !21
99
+ %66 = bitcast float %65 to i32, !dbg !21
100
+ %67 = bitcast float %23 to i32, !dbg !21
101
+ %68 = and i32 %67, -2147483648, !dbg !21
102
+ %69 = or i32 %68, %66, !dbg !21
103
+ %70 = bitcast i32 %69 to float, !dbg !21
104
+ br label %__nv_erff.exit, !dbg !21
105
+
106
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %63
107
+ %r.0.i = phi float [ %70, %63 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
108
+ %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
109
+ %.not.i2 = icmp eq i32 %71, 0, !dbg !21
110
+ %72 = tail call float @llvm.nvvm.fabs.ftz.f(float %24) #4, !dbg !21
111
+ %73 = tail call float @llvm.nvvm.fabs.f(float %24) #4, !dbg !21
112
+ %.0.i3 = select i1 %.not.i2, float %73, float %72, !dbg !21
113
+ %74 = fcmp oge float %.0.i3, 0x3FF00C1FC0000000, !dbg !21
114
+ br i1 %74, label %__nv_fabsf.exit1.i20, label %76, !dbg !21
115
+
116
+ __nv_fabsf.exit1.i20: ; preds = %__nv_erff.exit
117
+ %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
118
+ %.not1.i21 = icmp eq i32 %75, 0, !dbg !21
119
+ %.01.i22 = select i1 %.not1.i21, float %73, float %72, !dbg !21
120
+ br label %__internal_fmad.exit.i4, !dbg !21
121
+
122
+ 76: ; preds = %__nv_erff.exit
123
+ %77 = fmul float %24, %24, !dbg !21
124
+ br label %__internal_fmad.exit.i4, !dbg !21
125
+
126
+ __internal_fmad.exit.i4: ; preds = %76, %__nv_fabsf.exit1.i20
127
+ %78 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i20 ], [ 0x3FC06EBA60000000, %76 ], !dbg !21
128
+ %79 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i20 ], [ 0xBFD8127580000000, %76 ], !dbg !21
129
+ %80 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i20 ], [ 0x3FBCE315E0000000, %76 ], !dbg !21
130
+ %81 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i20 ], [ 0xBF9B837CE0000000, %76 ], !dbg !21
131
+ %82 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i20 ], [ 0x3F755ABD40000000, %76 ], !dbg !21
132
+ %83 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i20 ], [ 0xBF4AE9A400000000, %76 ], !dbg !21
133
+ %84 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i20 ], [ 0x3F163D2D40000000, %76 ], !dbg !21
134
+ %85 = phi float [ %.01.i22, %__nv_fabsf.exit1.i20 ], [ %77, %76 ], !dbg !21
135
+ %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
136
+ %.not2.i5 = icmp eq i32 %86, 0, !dbg !21
137
+ %87 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %84, float %85, float %83) #4, !dbg !21
138
+ %88 = tail call float @llvm.nvvm.fma.rn.f(float %84, float %85, float %83) #4, !dbg !21
139
+ %.02.i6 = select i1 %.not2.i5, float %88, float %87, !dbg !21
140
+ %89 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
141
+ %.not3.i7 = icmp eq i32 %89, 0, !dbg !21
142
+ %90 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i6, float %85, float %82) #4, !dbg !21
143
+ %91 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i6, float %85, float %82) #4, !dbg !21
144
+ %.03.i8 = select i1 %.not3.i7, float %91, float %90, !dbg !21
145
+ %92 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
146
+ %.not4.i9 = icmp eq i32 %92, 0, !dbg !21
147
+ %93 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i8, float %85, float %81) #4, !dbg !21
148
+ %94 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i8, float %85, float %81) #4, !dbg !21
149
+ %.04.i10 = select i1 %.not4.i9, float %94, float %93, !dbg !21
150
+ %95 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
151
+ %.not5.i11 = icmp eq i32 %95, 0, !dbg !21
152
+ %96 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i10, float %85, float %80) #4, !dbg !21
153
+ %97 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i10, float %85, float %80) #4, !dbg !21
154
+ %.05.i12 = select i1 %.not5.i11, float %97, float %96, !dbg !21
155
+ %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
156
+ %.not6.i13 = icmp eq i32 %98, 0, !dbg !21
157
+ %99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i12, float %85, float %79) #4, !dbg !21
158
+ %100 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i12, float %85, float %79) #4, !dbg !21
159
+ %.06.i14 = select i1 %.not6.i13, float %100, float %99, !dbg !21
160
+ %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
161
+ %.not7.i15 = icmp eq i32 %101, 0, !dbg !21
162
+ %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i14, float %85, float %78) #4, !dbg !21
163
+ %103 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i14, float %85, float %78) #4, !dbg !21
164
+ %.07.i16 = select i1 %.not7.i15, float %103, float %102, !dbg !21
165
+ %104 = fneg float %85, !dbg !21
166
+ %105 = select i1 %74, float %104, float %24, !dbg !21
167
+ %106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
168
+ %.not8.i17 = icmp eq i32 %106, 0, !dbg !21
169
+ %107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i16, float %105, float %105) #4, !dbg !21
170
+ %108 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i16, float %105, float %105) #4, !dbg !21
171
+ %.08.i18 = select i1 %.not8.i17, float %108, float %107, !dbg !21
172
+ br i1 %74, label %109, label %__nv_erff.exit23, !dbg !21
173
+
174
+ 109: ; preds = %__internal_fmad.exit.i4
175
+ %110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i18) #4, !dbg !21
176
+ %111 = fsub float 1.000000e+00, %110, !dbg !21
177
+ %112 = bitcast float %111 to i32, !dbg !21
178
+ %113 = bitcast float %24 to i32, !dbg !21
179
+ %114 = and i32 %113, -2147483648, !dbg !21
180
+ %115 = or i32 %114, %112, !dbg !21
181
+ %116 = bitcast i32 %115 to float, !dbg !21
182
+ br label %__nv_erff.exit23, !dbg !21
183
+
184
+ __nv_erff.exit23: ; preds = %__internal_fmad.exit.i4, %109
185
+ %r.0.i19 = phi float [ %116, %109 ], [ %.08.i18, %__internal_fmad.exit.i4 ], !dbg !21
186
+ %117 = fadd float %r.0.i, 1.000000e+00, !dbg !22
187
+ %118 = fadd float %r.0.i19, 1.000000e+00, !dbg !22
188
+ %119 = fmul float %117, 5.000000e-01, !dbg !23
189
+ %120 = fmul float %118, 5.000000e-01, !dbg !23
190
+ %121 = fmul float %21, %21, !dbg !24
191
+ %122 = fmul float %22, %22, !dbg !24
192
+ %123 = fmul float %121, -5.000000e-01, !dbg !25
193
+ %124 = fmul float %122, -5.000000e-01, !dbg !25
194
+ %125 = fmul float %123, 0x3FF7154760000000, !dbg !26
195
+ %126 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %125) #4, !dbg !26
196
+ %127 = fmul float %124, 0x3FF7154760000000, !dbg !26
197
+ %128 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %127) #4, !dbg !26
198
+ %129 = fmul float %126, 0x3FD9884540000000, !dbg !27
199
+ %130 = fmul float %128, 0x3FD9884540000000, !dbg !27
200
+ %131 = fmul float %21, %129, !dbg !28
201
+ %132 = fmul float %22, %130, !dbg !28
202
+ %133 = fadd float %119, %131, !dbg !29
203
+ %134 = fadd float %120, %132, !dbg !29
204
+ %135 = fmul float %15, %133, !dbg !30
205
+ %136 = fmul float %16, %134, !dbg !30
206
+ %137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %135) #4, !dbg !31
207
+ %138 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %136) #4, !dbg !31
208
+ %139 = insertelement <2 x i16> undef, i16 %137, i64 0, !dbg !31
209
+ %140 = insertelement <2 x i16> %139, i16 %138, i64 1, !dbg !31
210
+ %141 = bitcast <2 x i16> %140 to i32, !dbg !31
211
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %141, ptr addrspace(1) %11, i1 true) #4, !dbg !31
212
+ ret void, !dbg !32
213
+ }
214
+
215
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
216
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
217
+
218
+ ; Function Attrs: alwaysinline nounwind
219
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
220
+ __nv_fabsf.exit:
221
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
222
+ %.not = icmp eq i32 %0, 0
223
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
224
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
225
+ %.0 = select i1 %.not, float %2, float %1
226
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
227
+ br i1 %3, label %__nv_fabsf.exit1, label %5
228
+
229
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
230
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
231
+ %.not1 = icmp eq i32 %4, 0
232
+ %.01 = select i1 %.not1, float %2, float %1
233
+ br label %__internal_fmad.exit
234
+
235
+ 5: ; preds = %__nv_fabsf.exit
236
+ %6 = fmul float %a, %a
237
+ br label %__internal_fmad.exit
238
+
239
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
240
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
241
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
242
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
243
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
244
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
245
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
246
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
247
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
248
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
249
+ %.not2 = icmp eq i32 %15, 0
250
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
251
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
252
+ %.02 = select i1 %.not2, float %17, float %16
253
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
254
+ %.not3 = icmp eq i32 %18, 0
255
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
256
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
257
+ %.03 = select i1 %.not3, float %20, float %19
258
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
259
+ %.not4 = icmp eq i32 %21, 0
260
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
261
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
262
+ %.04 = select i1 %.not4, float %23, float %22
263
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
264
+ %.not5 = icmp eq i32 %24, 0
265
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
266
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
267
+ %.05 = select i1 %.not5, float %26, float %25
268
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
269
+ %.not6 = icmp eq i32 %27, 0
270
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
271
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
272
+ %.06 = select i1 %.not6, float %29, float %28
273
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
274
+ %.not7 = icmp eq i32 %30, 0
275
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
276
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
277
+ %.07 = select i1 %.not7, float %32, float %31
278
+ %33 = fneg float %14
279
+ %34 = select i1 %3, float %33, float %a
280
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
281
+ %.not8 = icmp eq i32 %35, 0
282
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
283
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
284
+ %.08 = select i1 %.not8, float %37, float %36
285
+ br i1 %3, label %38, label %46
286
+
287
+ 38: ; preds = %__internal_fmad.exit
288
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
289
+ %40 = fsub float 1.000000e+00, %39
290
+ %41 = bitcast float %40 to i32
291
+ %42 = bitcast float %a to i32
292
+ %43 = and i32 %42, -2147483648
293
+ %44 = or i32 %43, %41
294
+ %45 = bitcast i32 %44 to float
295
+ br label %46
296
+
297
+ 46: ; preds = %38, %__internal_fmad.exit
298
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
299
+ ret float %r.0
300
+ }
301
+
302
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
303
+
304
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
305
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
306
+
307
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
308
+ declare float @llvm.nvvm.fabs.f(float) #0
309
+
310
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
311
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
312
+
313
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
314
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
315
+
316
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
317
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
318
+
319
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
320
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
321
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
322
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
323
+ attributes #4 = { nounwind }
324
+
325
+ !llvm.module.flags = !{!0, !1}
326
+ !llvm.dbg.cu = !{!2}
327
+ !nvvm.annotations = !{!4, !5, !5, !4}
328
+ !llvm.ident = !{!6}
329
+
330
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
331
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
332
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
333
+ !3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
334
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
335
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
336
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
337
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
338
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
339
+ !9 = !{}
340
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
341
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
342
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
343
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
344
+ !14 = !DILocation(line: 24, column: 34, scope: !7)
345
+ !15 = !DILocation(line: 24, column: 39, scope: !7)
346
+ !16 = !DILocation(line: 24, column: 48, scope: !7)
347
+ !17 = !DILocation(line: 25, column: 30, scope: !7)
348
+ !18 = !DILocation(line: 25, column: 35, scope: !7)
349
+ !19 = !DILocation(line: 25, column: 44, scope: !7)
350
+ !20 = !DILocation(line: 29, column: 18, scope: !7)
351
+ !21 = !DILocation(line: 30, column: 23, scope: !7)
352
+ !22 = !DILocation(line: 32, column: 18, scope: !7)
353
+ !23 = !DILocation(line: 34, column: 19, scope: !7)
354
+ !24 = !DILocation(line: 35, column: 19, scope: !7)
355
+ !25 = !DILocation(line: 37, column: 20, scope: !7)
356
+ !26 = !DILocation(line: 38, column: 19, scope: !7)
357
+ !27 = !DILocation(line: 40, column: 20, scope: !7)
358
+ !28 = !DILocation(line: 41, column: 19, scope: !7)
359
+ !29 = !DILocation(line: 42, column: 20, scope: !7)
360
+ !30 = !DILocation(line: 43, column: 19, scope: !7)
361
+ !31 = !DILocation(line: 45, column: 40, scope: !7)
362
+ !32 = !DILocation(line: 45, column: 4, scope: !7)
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !5 {
7
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %10 = and i32 %9, 31, !dbg !8
9
+ %11 = lshr i32 %9, 5, !dbg !8
10
+ %12 = and i32 %11, 1, !dbg !8
11
+ %urem = shl i32 %9, 2, !dbg !8
12
+ %13 = and i32 %urem, 252, !dbg !8
13
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
14
+ %15 = shl i32 %14, 8, !dbg !10
15
+ %16 = or i32 %15, %13, !dbg !11
16
+ %17 = sext i32 %16 to i64, !dbg !12
17
+ %18 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !12
18
+ %19 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
19
+ %20 = extractvalue { i32, i32 } %19, 0, !dbg !13
20
+ %21 = extractvalue { i32, i32 } %19, 1, !dbg !13
21
+ %22 = trunc i32 %20 to i16, !dbg !13
22
+ %extelt.offset = lshr i32 %20, 16, !dbg !13
23
+ %23 = trunc i32 %extelt.offset to i16, !dbg !13
24
+ %24 = trunc i32 %21 to i16, !dbg !13
25
+ %extelt.offset1 = lshr i32 %21, 16, !dbg !13
26
+ %25 = trunc i32 %extelt.offset1 to i16, !dbg !13
27
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #3, !dbg !14
28
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
29
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
30
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
31
+ %30 = zext nneg i32 %13 to i64, !dbg !15
32
+ %31 = getelementptr float, ptr addrspace(1) %2, i64 %30, !dbg !15
33
+ %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %31, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
34
+ %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !16
35
+ %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !16
36
+ %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !16
37
+ %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !16
38
+ %37 = bitcast i32 %33 to float, !dbg !16
39
+ %38 = bitcast i32 %34 to float, !dbg !16
40
+ %39 = bitcast i32 %35 to float, !dbg !16
41
+ %40 = bitcast i32 %36 to float, !dbg !16
42
+ %41 = getelementptr float, ptr addrspace(1) %3, i64 %17, !dbg !17
43
+ %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
44
+ %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !18
45
+ %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !18
46
+ %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !18
47
+ %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !18
48
+ %47 = bitcast i32 %43 to float, !dbg !18
49
+ %48 = bitcast i32 %44 to float, !dbg !18
50
+ %49 = bitcast i32 %45 to float, !dbg !18
51
+ %50 = bitcast i32 %46 to float, !dbg !18
52
+ %51 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !19
53
+ %52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
54
+ %53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !20
55
+ %54 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !20
56
+ %55 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !20
57
+ %56 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !20
58
+ %57 = bitcast i32 %53 to float, !dbg !20
59
+ %58 = bitcast i32 %54 to float, !dbg !20
60
+ %59 = bitcast i32 %55 to float, !dbg !20
61
+ %60 = bitcast i32 %56 to float, !dbg !20
62
+ %61 = sext i32 %14 to i64, !dbg !21
63
+ %62 = getelementptr float, ptr addrspace(1) %4, i64 %61, !dbg !21
64
+ %63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
65
+ %64 = bitcast i32 %63 to float, !dbg !22
66
+ %65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
67
+ %66 = bitcast i32 %65 to float, !dbg !22
68
+ %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
69
+ %68 = bitcast i32 %67 to float, !dbg !22
70
+ %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
71
+ %70 = bitcast i32 %69 to float, !dbg !22
72
+ %71 = fmul float %26, %37, !dbg !23
73
+ %72 = fmul float %27, %38, !dbg !23
74
+ %73 = fmul float %28, %39, !dbg !23
75
+ %74 = fmul float %29, %40, !dbg !23
76
+ %75 = fadd float %71, %72, !dbg !24
77
+ %76 = fadd float %73, %75, !dbg !24
78
+ %77 = fadd float %74, %76, !dbg !24
79
+ %78 = bitcast float %77 to i32, !dbg !30
80
+ %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !30
81
+ %80 = bitcast i32 %79 to float, !dbg !30
82
+ %81 = fadd float %77, %80, !dbg !24
83
+ %82 = bitcast float %81 to i32, !dbg !30
84
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !30
85
+ %84 = bitcast i32 %83 to float, !dbg !30
86
+ %85 = fadd float %81, %84, !dbg !24
87
+ %86 = bitcast float %85 to i32, !dbg !30
88
+ %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !30
89
+ %88 = bitcast i32 %87 to float, !dbg !30
90
+ %89 = fadd float %85, %88, !dbg !24
91
+ %90 = bitcast float %89 to i32, !dbg !30
92
+ %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !30
93
+ %92 = bitcast i32 %91 to float, !dbg !30
94
+ %93 = fadd float %89, %92, !dbg !24
95
+ %94 = bitcast float %93 to i32, !dbg !30
96
+ %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !30
97
+ %96 = bitcast i32 %95 to float, !dbg !30
98
+ %97 = fadd float %93, %96, !dbg !24
99
+ %98 = icmp eq i32 %10, 0, !dbg !30
100
+ %99 = zext nneg i32 %12 to i64, !dbg !30
101
+ %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !30
102
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %97, i1 %98) #3, !dbg !30
103
+ tail call void @llvm.nvvm.barrier0(), !dbg !30
104
+ %101 = icmp slt i32 %9, 2, !dbg !30
105
+ %102 = sext i32 %9 to i64, !dbg !30
106
+ %103 = getelementptr float, ptr addrspace(3) @global_smem, i64 %102, !dbg !30
107
+ %104 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !30
108
+ %105 = bitcast float %104 to i32, !dbg !30
109
+ %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !30
110
+ %107 = bitcast i32 %106 to float, !dbg !30
111
+ %108 = fadd float %104, %107, !dbg !24
112
+ %109 = and i32 %9, 1, !dbg !30
113
+ %110 = icmp eq i32 %109, 0, !dbg !30
114
+ %111 = and i1 %101, %110, !dbg !30
115
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %108, i1 %111) #3, !dbg !30
116
+ tail call void @llvm.nvvm.barrier0(), !dbg !30
117
+ %112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !30
118
+ %113 = fadd float %112, 0.000000e+00, !dbg !32
119
+ %114 = fmul float %71, %47, !dbg !36
120
+ %115 = fmul float %72, %48, !dbg !36
121
+ %116 = fmul float %73, %49, !dbg !36
122
+ %117 = fmul float %74, %50, !dbg !36
123
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
124
+ %118 = fadd float %114, %115, !dbg !39
125
+ %119 = fadd float %116, %118, !dbg !39
126
+ %120 = fadd float %117, %119, !dbg !39
127
+ %121 = bitcast float %120 to i32, !dbg !37
128
+ %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !37
129
+ %123 = bitcast i32 %122 to float, !dbg !37
130
+ %124 = fadd float %120, %123, !dbg !39
131
+ %125 = bitcast float %124 to i32, !dbg !37
132
+ %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 8, i32 31), !dbg !37
133
+ %127 = bitcast i32 %126 to float, !dbg !37
134
+ %128 = fadd float %124, %127, !dbg !39
135
+ %129 = bitcast float %128 to i32, !dbg !37
136
+ %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 4, i32 31), !dbg !37
137
+ %131 = bitcast i32 %130 to float, !dbg !37
138
+ %132 = fadd float %128, %131, !dbg !39
139
+ %133 = bitcast float %132 to i32, !dbg !37
140
+ %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !37
141
+ %135 = bitcast i32 %134 to float, !dbg !37
142
+ %136 = fadd float %132, %135, !dbg !39
143
+ %137 = bitcast float %136 to i32, !dbg !37
144
+ %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !37
145
+ %139 = bitcast i32 %138 to float, !dbg !37
146
+ %140 = fadd float %136, %139, !dbg !39
147
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %140, i1 %98) #3, !dbg !37
148
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
149
+ %141 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !37
150
+ %142 = bitcast float %141 to i32, !dbg !37
151
+ %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 1, i32 31), !dbg !37
152
+ %144 = bitcast i32 %143 to float, !dbg !37
153
+ %145 = fadd float %141, %144, !dbg !39
154
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %145, i1 %111) #3, !dbg !37
155
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
156
+ %146 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
157
+ %147 = fadd float %146, 0.000000e+00, !dbg !42
158
+ %148 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %64, float 2.560000e+02) #3, !dbg !44
159
+ %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %66, float 2.560000e+02) #3, !dbg !44
160
+ %150 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %68, float 2.560000e+02) #3, !dbg !44
161
+ %151 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %70, float 2.560000e+02) #3, !dbg !44
162
+ %152 = fmul float %71, 2.560000e+02, !dbg !45
163
+ %153 = fmul float %72, 2.560000e+02, !dbg !45
164
+ %154 = fmul float %73, 2.560000e+02, !dbg !45
165
+ %155 = fmul float %74, 2.560000e+02, !dbg !45
166
+ %156 = fsub float %152, %113, !dbg !46
167
+ %157 = fsub float %153, %113, !dbg !46
168
+ %158 = fsub float %154, %113, !dbg !46
169
+ %159 = fsub float %155, %113, !dbg !46
170
+ %160 = fmul float %147, %47, !dbg !47
171
+ %161 = fmul float %147, %48, !dbg !47
172
+ %162 = fmul float %147, %49, !dbg !47
173
+ %163 = fmul float %147, %50, !dbg !47
174
+ %164 = fsub float %156, %160, !dbg !48
175
+ %165 = fsub float %157, %161, !dbg !48
176
+ %166 = fsub float %158, %162, !dbg !48
177
+ %167 = fsub float %159, %163, !dbg !48
178
+ %168 = fmul float %148, %164, !dbg !49
179
+ %169 = fmul float %148, %165, !dbg !49
180
+ %170 = fmul float %148, %166, !dbg !49
181
+ %171 = fmul float %148, %167, !dbg !49
182
+ %172 = fadd float %168, %57, !dbg !50
183
+ %173 = fadd float %169, %58, !dbg !50
184
+ %174 = fadd float %170, %59, !dbg !50
185
+ %175 = fadd float %171, %60, !dbg !50
186
+ %176 = bitcast float %172 to i32, !dbg !51
187
+ %177 = bitcast float %173 to i32, !dbg !51
188
+ %178 = bitcast float %174 to i32, !dbg !51
189
+ %179 = bitcast float %175 to i32, !dbg !51
190
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %176, i32 %177, i32 %178, i32 %179, ptr addrspace(1) %51, i1 true) #3, !dbg !51
191
+ %180 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52
192
+ %181 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %172) #3, !dbg !53
193
+ %182 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %173) #3, !dbg !53
194
+ %183 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %174) #3, !dbg !53
195
+ %184 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %175) #3, !dbg !53
196
+ %185 = insertelement <2 x i16> undef, i16 %181, i64 0, !dbg !53
197
+ %186 = insertelement <2 x i16> %185, i16 %182, i64 1, !dbg !53
198
+ %187 = bitcast <2 x i16> %186 to i32, !dbg !53
199
+ %188 = insertelement <2 x i16> undef, i16 %183, i64 0, !dbg !53
200
+ %189 = insertelement <2 x i16> %188, i16 %184, i64 1, !dbg !53
201
+ %190 = bitcast <2 x i16> %189 to i32, !dbg !53
202
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %187, i32 %190, ptr addrspace(1) %180, i1 true) #3, !dbg !53
203
+ ret void, !dbg !54
204
+ }
205
+
206
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
207
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
208
+
209
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
210
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
211
+
212
+ ; Function Attrs: convergent nocallback nounwind
213
+ declare void @llvm.nvvm.barrier0() #2
214
+
215
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
216
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
217
+ attributes #2 = { convergent nocallback nounwind }
218
+ attributes #3 = { nounwind }
219
+
220
+ !llvm.module.flags = !{!0}
221
+ !llvm.dbg.cu = !{!1}
222
+ !nvvm.annotations = !{!3, !4, !4, !3}
223
+
224
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
225
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
226
+ !2 = !DIFile(filename: "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py", directory: "/tmp/torchinductor_root/rn")
227
+ !3 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
228
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
229
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
230
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
231
+ !7 = !{}
232
+ !8 = !DILocation(line: 26, column: 26, scope: !5)
233
+ !9 = !DILocation(line: 23, column: 28, scope: !5)
234
+ !10 = !DILocation(line: 30, column: 40, scope: !5)
235
+ !11 = !DILocation(line: 30, column: 36, scope: !5)
236
+ !12 = !DILocation(line: 30, column: 30, scope: !5)
237
+ !13 = !DILocation(line: 30, column: 46, scope: !5)
238
+ !14 = !DILocation(line: 30, column: 67, scope: !5)
239
+ !15 = !DILocation(line: 31, column: 30, scope: !5)
240
+ !16 = !DILocation(line: 31, column: 35, scope: !5)
241
+ !17 = !DILocation(line: 32, column: 30, scope: !5)
242
+ !18 = !DILocation(line: 32, column: 46, scope: !5)
243
+ !19 = !DILocation(line: 33, column: 35, scope: !5)
244
+ !20 = !DILocation(line: 33, column: 51, scope: !5)
245
+ !21 = !DILocation(line: 34, column: 31, scope: !5)
246
+ !22 = !DILocation(line: 34, column: 36, scope: !5)
247
+ !23 = !DILocation(line: 36, column: 18, scope: !5)
248
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !28)
249
+ !25 = distinct !DILexicalBlockFile(scope: !27, file: !26, discriminator: 0)
250
+ !26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
251
+ !27 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0)
252
+ !28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
253
+ !29 = !DILocation(line: 39, column: 57, scope: !25)
254
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
255
+ !31 = !DILocation(line: 39, column: 57, scope: !27)
256
+ !32 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !35)
257
+ !33 = distinct !DILexicalBlockFile(scope: !5, file: !34, discriminator: 0)
258
+ !34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
259
+ !35 = !DILocation(line: 39, column: 44, scope: !33)
260
+ !36 = !DILocation(line: 40, column: 18, scope: !5)
261
+ !37 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !38)
262
+ !38 = !DILocation(line: 43, column: 59, scope: !27)
263
+ !39 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !40)
264
+ !40 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !41)
265
+ !41 = !DILocation(line: 43, column: 59, scope: !25)
266
+ !42 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !43)
267
+ !43 = !DILocation(line: 43, column: 45, scope: !33)
268
+ !44 = !DILocation(line: 45, column: 20, scope: !5)
269
+ !45 = !DILocation(line: 46, column: 19, scope: !5)
270
+ !46 = !DILocation(line: 47, column: 20, scope: !5)
271
+ !47 = !DILocation(line: 48, column: 19, scope: !5)
272
+ !48 = !DILocation(line: 49, column: 20, scope: !5)
273
+ !49 = !DILocation(line: 50, column: 20, scope: !5)
274
+ !50 = !DILocation(line: 51, column: 20, scope: !5)
275
+ !51 = !DILocation(line: 53, column: 51, scope: !5)
276
+ !52 = !DILocation(line: 54, column: 25, scope: !5)
277
+ !53 = !DILocation(line: 54, column: 48, scope: !5)
278
+ !54 = !DILocation(line: 54, column: 4, scope: !5)