Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir +366 -0
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx +807 -0
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir +76 -0
- .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir +57 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin +0 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx +764 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir +26 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir +25 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin +0 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir +290 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx +653 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir +60 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir +53 -0
- .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir +24 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin +0 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir +63 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx +577 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir +65 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin +0 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx +743 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir +72 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin +0 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir +132 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir +304 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir +62 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir +61 -0
- .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir +213 -0
- .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx +495 -0
- .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx +486 -0
- .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir +38 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir +53 -0
- .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin +0 -0
- .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx +709 -0
- .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir +66 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin +0 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir +15 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin +0 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir +38 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir +37 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.ttir +17 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir +368 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir +127 -0
- .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir +333 -0
- .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin +0 -0
- .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir +230 -0
- .triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx +296 -0
- .triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx +777 -0
- .triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir +78 -0
- .triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir +100 -0
- .triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir +16 -0
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 {
|
8 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%12 = and i32 %11, 31, !dbg !10
|
10 |
+
%13 = lshr i32 %11, 5, !dbg !10
|
11 |
+
%14 = and i32 %13, 1, !dbg !10
|
12 |
+
%urem = shl i32 %11, 2, !dbg !10
|
13 |
+
%15 = and i32 %urem, 252, !dbg !10
|
14 |
+
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%17 = shl i32 %16, 8, !dbg !12
|
16 |
+
%18 = or i32 %17, %15, !dbg !13
|
17 |
+
%19 = sext i32 %18 to i64, !dbg !14
|
18 |
+
%20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14
|
19 |
+
%21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15
|
21 |
+
%23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15
|
22 |
+
%24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15
|
23 |
+
%25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15
|
24 |
+
%26 = bitcast i32 %22 to float, !dbg !15
|
25 |
+
%27 = bitcast i32 %23 to float, !dbg !15
|
26 |
+
%28 = bitcast i32 %24 to float, !dbg !15
|
27 |
+
%29 = bitcast i32 %25 to float, !dbg !15
|
28 |
+
%30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16
|
29 |
+
%31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
30 |
+
%32 = extractvalue { i32, i32 } %31, 0, !dbg !17
|
31 |
+
%33 = extractvalue { i32, i32 } %31, 1, !dbg !17
|
32 |
+
%34 = trunc i32 %32 to i16, !dbg !17
|
33 |
+
%extelt.offset = lshr i32 %32, 16, !dbg !17
|
34 |
+
%35 = trunc i32 %extelt.offset to i16, !dbg !17
|
35 |
+
%36 = trunc i32 %33 to i16, !dbg !17
|
36 |
+
%extelt.offset1 = lshr i32 %33, 16, !dbg !17
|
37 |
+
%37 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
38 |
+
%38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
|
39 |
+
%39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18
|
40 |
+
%40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18
|
41 |
+
%41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18
|
42 |
+
%42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19
|
43 |
+
%43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
44 |
+
%44 = extractvalue { i32, i32 } %43, 0, !dbg !20
|
45 |
+
%45 = extractvalue { i32, i32 } %43, 1, !dbg !20
|
46 |
+
%46 = trunc i32 %44 to i16, !dbg !20
|
47 |
+
%extelt.offset2 = lshr i32 %44, 16, !dbg !20
|
48 |
+
%47 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
49 |
+
%48 = trunc i32 %45 to i16, !dbg !20
|
50 |
+
%extelt.offset3 = lshr i32 %45, 16, !dbg !20
|
51 |
+
%49 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
52 |
+
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21
|
53 |
+
%51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21
|
54 |
+
%52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21
|
55 |
+
%53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21
|
56 |
+
%54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22
|
57 |
+
%55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
58 |
+
%56 = extractvalue { i32, i32 } %55, 0, !dbg !23
|
59 |
+
%57 = extractvalue { i32, i32 } %55, 1, !dbg !23
|
60 |
+
%58 = trunc i32 %56 to i16, !dbg !23
|
61 |
+
%extelt.offset4 = lshr i32 %56, 16, !dbg !23
|
62 |
+
%59 = trunc i32 %extelt.offset4 to i16, !dbg !23
|
63 |
+
%60 = trunc i32 %57 to i16, !dbg !23
|
64 |
+
%extelt.offset5 = lshr i32 %57, 16, !dbg !23
|
65 |
+
%61 = trunc i32 %extelt.offset5 to i16, !dbg !23
|
66 |
+
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24
|
67 |
+
%63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24
|
68 |
+
%64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24
|
69 |
+
%65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24
|
70 |
+
%66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25
|
71 |
+
%67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
|
72 |
+
%68 = extractvalue { i32, i32 } %67, 0, !dbg !26
|
73 |
+
%69 = extractvalue { i32, i32 } %67, 1, !dbg !26
|
74 |
+
%70 = trunc i32 %68 to i16, !dbg !26
|
75 |
+
%extelt.offset6 = lshr i32 %68, 16, !dbg !26
|
76 |
+
%71 = trunc i32 %extelt.offset6 to i16, !dbg !26
|
77 |
+
%72 = trunc i32 %69 to i16, !dbg !26
|
78 |
+
%extelt.offset7 = lshr i32 %69, 16, !dbg !26
|
79 |
+
%73 = trunc i32 %extelt.offset7 to i16, !dbg !26
|
80 |
+
%74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27
|
81 |
+
%75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27
|
82 |
+
%76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27
|
83 |
+
%77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27
|
84 |
+
%78 = zext nneg i32 %15 to i64, !dbg !28
|
85 |
+
%79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28
|
86 |
+
%80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
87 |
+
%81 = fadd float %38, %26, !dbg !30
|
88 |
+
%82 = fadd float %39, %27, !dbg !30
|
89 |
+
%83 = fadd float %40, %28, !dbg !30
|
90 |
+
%84 = fadd float %81, %50, !dbg !31
|
91 |
+
%85 = fadd float %82, %51, !dbg !31
|
92 |
+
%86 = fadd float %83, %52, !dbg !31
|
93 |
+
%87 = fadd float %85, %63, !dbg !32
|
94 |
+
%88 = fadd float %86, %64, !dbg !32
|
95 |
+
%89 = fadd float %87, %75, !dbg !33
|
96 |
+
%90 = fadd float %88, %76, !dbg !33
|
97 |
+
%91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32
|
98 |
+
%92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32
|
99 |
+
%93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32
|
100 |
+
%94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32
|
101 |
+
%95 = fadd <2 x float> %92, %94, !dbg !32
|
102 |
+
%96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33
|
103 |
+
%97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33
|
104 |
+
%98 = fadd <2 x float> %95, %97, !dbg !33
|
105 |
+
%99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34
|
106 |
+
%100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34
|
107 |
+
%101 = fadd <2 x float> %98, %100, !dbg !34
|
108 |
+
%102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34
|
109 |
+
%103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34
|
110 |
+
%104 = fadd <2 x float> %101, %103, !dbg !34
|
111 |
+
%105 = extractelement <2 x float> %104, i64 0, !dbg !34
|
112 |
+
%106 = extractelement <2 x float> %104, i64 1, !dbg !34
|
113 |
+
%107 = fadd float %105, %106, !dbg !34
|
114 |
+
%108 = bitcast float %107 to i32, !dbg !40
|
115 |
+
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40
|
116 |
+
%110 = bitcast i32 %109 to float, !dbg !40
|
117 |
+
%111 = fadd float %107, %110, !dbg !34
|
118 |
+
%112 = bitcast float %111 to i32, !dbg !40
|
119 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40
|
120 |
+
%114 = bitcast i32 %113 to float, !dbg !40
|
121 |
+
%115 = fadd float %111, %114, !dbg !34
|
122 |
+
%116 = bitcast float %115 to i32, !dbg !40
|
123 |
+
%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40
|
124 |
+
%118 = bitcast i32 %117 to float, !dbg !40
|
125 |
+
%119 = fadd float %115, %118, !dbg !34
|
126 |
+
%120 = bitcast float %119 to i32, !dbg !40
|
127 |
+
%121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40
|
128 |
+
%122 = bitcast i32 %121 to float, !dbg !40
|
129 |
+
%123 = fadd float %119, %122, !dbg !34
|
130 |
+
%124 = bitcast float %123 to i32, !dbg !40
|
131 |
+
%125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40
|
132 |
+
%126 = bitcast i32 %125 to float, !dbg !40
|
133 |
+
%127 = fadd float %123, %126, !dbg !34
|
134 |
+
%128 = icmp eq i32 %12, 0, !dbg !40
|
135 |
+
%129 = zext nneg i32 %14 to i64, !dbg !40
|
136 |
+
%130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40
|
137 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40
|
138 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
139 |
+
%131 = icmp slt i32 %11, 2, !dbg !40
|
140 |
+
%132 = sext i32 %11 to i64, !dbg !40
|
141 |
+
%133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40
|
142 |
+
%134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40
|
143 |
+
%135 = bitcast float %134 to i32, !dbg !40
|
144 |
+
%136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40
|
145 |
+
%137 = bitcast i32 %136 to float, !dbg !40
|
146 |
+
%138 = fadd float %134, %137, !dbg !34
|
147 |
+
%139 = and i32 %11, 1, !dbg !40
|
148 |
+
%140 = icmp eq i32 %139, 0, !dbg !40
|
149 |
+
%141 = and i1 %131, %140, !dbg !40
|
150 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40
|
151 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
152 |
+
%142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40
|
153 |
+
%143 = fadd float %142, 0.000000e+00, !dbg !42
|
154 |
+
%144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46
|
155 |
+
%145 = extractelement <2 x float> %98, i64 0, !dbg !47
|
156 |
+
%146 = fsub float %145, %144, !dbg !47
|
157 |
+
%147 = fsub float %89, %144, !dbg !47
|
158 |
+
%148 = fsub float %90, %144, !dbg !47
|
159 |
+
%149 = fsub float %106, %144, !dbg !47
|
160 |
+
%150 = fmul float %146, %146, !dbg !48
|
161 |
+
%151 = fmul float %147, %147, !dbg !48
|
162 |
+
%152 = fmul float %148, %148, !dbg !48
|
163 |
+
%153 = fmul float %149, %149, !dbg !48
|
164 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
165 |
+
%154 = fadd float %150, %151, !dbg !51
|
166 |
+
%155 = fadd float %152, %154, !dbg !51
|
167 |
+
%156 = fadd float %153, %155, !dbg !51
|
168 |
+
%157 = bitcast float %156 to i32, !dbg !49
|
169 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49
|
170 |
+
%159 = bitcast i32 %158 to float, !dbg !49
|
171 |
+
%160 = fadd float %156, %159, !dbg !51
|
172 |
+
%161 = bitcast float %160 to i32, !dbg !49
|
173 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49
|
174 |
+
%163 = bitcast i32 %162 to float, !dbg !49
|
175 |
+
%164 = fadd float %160, %163, !dbg !51
|
176 |
+
%165 = bitcast float %164 to i32, !dbg !49
|
177 |
+
%166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49
|
178 |
+
%167 = bitcast i32 %166 to float, !dbg !49
|
179 |
+
%168 = fadd float %164, %167, !dbg !51
|
180 |
+
%169 = bitcast float %168 to i32, !dbg !49
|
181 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49
|
182 |
+
%171 = bitcast i32 %170 to float, !dbg !49
|
183 |
+
%172 = fadd float %168, %171, !dbg !51
|
184 |
+
%173 = bitcast float %172 to i32, !dbg !49
|
185 |
+
%174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49
|
186 |
+
%175 = bitcast i32 %174 to float, !dbg !49
|
187 |
+
%176 = fadd float %172, %175, !dbg !51
|
188 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49
|
189 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
190 |
+
%177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49
|
191 |
+
%178 = bitcast float %177 to i32, !dbg !49
|
192 |
+
%179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49
|
193 |
+
%180 = bitcast i32 %179 to float, !dbg !49
|
194 |
+
%181 = fadd float %177, %180, !dbg !51
|
195 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49
|
196 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
197 |
+
%182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49
|
198 |
+
%183 = fadd float %182, 0.000000e+00, !dbg !54
|
199 |
+
%184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56
|
200 |
+
%185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57
|
201 |
+
%186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58
|
202 |
+
%.not.i = icmp eq i32 %186, 0, !dbg !58
|
203 |
+
br i1 %.not.i, label %189, label %187, !dbg !58
|
204 |
+
|
205 |
+
187: ; preds = %10
|
206 |
+
%188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58
|
207 |
+
br label %__nv_rsqrtf.exit, !dbg !58
|
208 |
+
|
209 |
+
189: ; preds = %10
|
210 |
+
%190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58
|
211 |
+
br label %__nv_rsqrtf.exit, !dbg !58
|
212 |
+
|
213 |
+
__nv_rsqrtf.exit: ; preds = %187, %189
|
214 |
+
%.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58
|
215 |
+
%191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29
|
216 |
+
%192 = bitcast i32 %191 to float, !dbg !29
|
217 |
+
%193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29
|
218 |
+
%194 = bitcast i32 %193 to float, !dbg !29
|
219 |
+
%195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29
|
220 |
+
%196 = bitcast i32 %195 to float, !dbg !29
|
221 |
+
%197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29
|
222 |
+
%198 = bitcast i32 %197 to float, !dbg !29
|
223 |
+
%199 = fmul float %146, %.0.i, !dbg !59
|
224 |
+
%200 = fmul float %147, %.0.i, !dbg !59
|
225 |
+
%201 = fmul float %148, %.0.i, !dbg !59
|
226 |
+
%202 = fmul float %149, %.0.i, !dbg !59
|
227 |
+
%203 = fmul float %199, %198, !dbg !60
|
228 |
+
%204 = fmul float %200, %196, !dbg !60
|
229 |
+
%205 = fmul float %201, %194, !dbg !60
|
230 |
+
%206 = fmul float %202, %192, !dbg !60
|
231 |
+
%207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61
|
232 |
+
%208 = bitcast float %145 to i32, !dbg !62
|
233 |
+
%209 = bitcast float %89 to i32, !dbg !62
|
234 |
+
%210 = bitcast float %90 to i32, !dbg !62
|
235 |
+
%211 = bitcast float %106 to i32, !dbg !62
|
236 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62
|
237 |
+
%212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63
|
238 |
+
%213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64
|
239 |
+
%214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64
|
240 |
+
%215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64
|
241 |
+
%216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64
|
242 |
+
%217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64
|
243 |
+
%218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64
|
244 |
+
%219 = bitcast <2 x i16> %218 to i32, !dbg !64
|
245 |
+
%220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64
|
246 |
+
%221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64
|
247 |
+
%222 = bitcast <2 x i16> %221 to i32, !dbg !64
|
248 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64
|
249 |
+
ret void, !dbg !65
|
250 |
+
}
|
251 |
+
|
252 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
253 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
254 |
+
|
255 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
256 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
257 |
+
|
258 |
+
; Function Attrs: convergent nocallback nounwind
|
259 |
+
declare void @llvm.nvvm.barrier0() #2
|
260 |
+
|
261 |
+
; Function Attrs: alwaysinline nounwind
|
262 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
263 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
264 |
+
%.not = icmp eq i32 %1, 0
|
265 |
+
br i1 %.not, label %4, label %2
|
266 |
+
|
267 |
+
2: ; preds = %0
|
268 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
269 |
+
br label %6
|
270 |
+
|
271 |
+
4: ; preds = %0
|
272 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
273 |
+
br label %6
|
274 |
+
|
275 |
+
6: ; preds = %4, %2
|
276 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
277 |
+
ret float %.0
|
278 |
+
}
|
279 |
+
|
280 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
281 |
+
|
282 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
283 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
284 |
+
|
285 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
286 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
287 |
+
|
288 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
289 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
290 |
+
attributes #2 = { convergent nocallback nounwind }
|
291 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
292 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
293 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
294 |
+
attributes #6 = { nounwind }
|
295 |
+
|
296 |
+
!llvm.module.flags = !{!0, !1}
|
297 |
+
!llvm.dbg.cu = !{!2}
|
298 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
299 |
+
!llvm.ident = !{!6}
|
300 |
+
|
301 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
302 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
303 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
304 |
+
!3 = !DIFile(filename: "cybxnh26qvsbmxmvdr54vaav2ezk2qxu7562fhhsn4lvyvqgoglw.py", directory: "/tmp/torchinductor_root/yb")
|
305 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
|
306 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
|
307 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
308 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
309 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
310 |
+
!9 = !{}
|
311 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
312 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
313 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
314 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
315 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
316 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
317 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
318 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
319 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
320 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
321 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
322 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
323 |
+
!22 = !DILocation(line: 33, column: 30, scope: !7)
|
324 |
+
!23 = !DILocation(line: 33, column: 46, scope: !7)
|
325 |
+
!24 = !DILocation(line: 33, column: 67, scope: !7)
|
326 |
+
!25 = !DILocation(line: 34, column: 31, scope: !7)
|
327 |
+
!26 = !DILocation(line: 34, column: 47, scope: !7)
|
328 |
+
!27 = !DILocation(line: 34, column: 68, scope: !7)
|
329 |
+
!28 = !DILocation(line: 35, column: 31, scope: !7)
|
330 |
+
!29 = !DILocation(line: 35, column: 36, scope: !7)
|
331 |
+
!30 = !DILocation(line: 37, column: 18, scope: !7)
|
332 |
+
!31 = !DILocation(line: 39, column: 18, scope: !7)
|
333 |
+
!32 = !DILocation(line: 41, column: 18, scope: !7)
|
334 |
+
!33 = !DILocation(line: 43, column: 19, scope: !7)
|
335 |
+
!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38)
|
336 |
+
!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0)
|
337 |
+
!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
338 |
+
!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
339 |
+
!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39)
|
340 |
+
!39 = !DILocation(line: 48, column: 59, scope: !35)
|
341 |
+
!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41)
|
342 |
+
!41 = !DILocation(line: 48, column: 59, scope: !37)
|
343 |
+
!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45)
|
344 |
+
!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0)
|
345 |
+
!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
346 |
+
!45 = !DILocation(line: 48, column: 45, scope: !43)
|
347 |
+
!46 = !DILocation(line: 51, column: 20, scope: !7)
|
348 |
+
!47 = !DILocation(line: 52, column: 20, scope: !7)
|
349 |
+
!48 = !DILocation(line: 53, column: 20, scope: !7)
|
350 |
+
!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50)
|
351 |
+
!50 = !DILocation(line: 56, column: 59, scope: !37)
|
352 |
+
!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52)
|
353 |
+
!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53)
|
354 |
+
!53 = !DILocation(line: 56, column: 59, scope: !35)
|
355 |
+
!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55)
|
356 |
+
!55 = !DILocation(line: 56, column: 45, scope: !43)
|
357 |
+
!56 = !DILocation(line: 59, column: 20, scope: !7)
|
358 |
+
!57 = !DILocation(line: 61, column: 20, scope: !7)
|
359 |
+
!58 = !DILocation(line: 62, column: 26, scope: !7)
|
360 |
+
!59 = !DILocation(line: 63, column: 20, scope: !7)
|
361 |
+
!60 = !DILocation(line: 64, column: 20, scope: !7)
|
362 |
+
!61 = !DILocation(line: 66, column: 25, scope: !7)
|
363 |
+
!62 = !DILocation(line: 66, column: 48, scope: !7)
|
364 |
+
!63 = !DILocation(line: 67, column: 25, scope: !7)
|
365 |
+
!64 = !DILocation(line: 67, column: 48, scope: !7)
|
366 |
+
!65 = !DILocation(line: 67, column: 4, scope: !7)
|
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx
ADDED
@@ -0,0 +1,807 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8de9de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
|
22 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
|
23 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
|
24 |
+
)
|
25 |
+
.maxntid 64, 1, 1
|
26 |
+
{
|
27 |
+
.reg .pred %p<33>;
|
28 |
+
.reg .b16 %rs<21>;
|
29 |
+
.reg .b32 %r<112>;
|
30 |
+
.reg .f32 %f<94>;
|
31 |
+
.reg .b64 %rd<20>;
|
32 |
+
.loc 1 18 0
|
33 |
+
$L__func_begin0:
|
34 |
+
.loc 1 18 0
|
35 |
+
|
36 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
|
37 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 26 26
|
40 |
+
mov.u32 %r78, %tid.x;
|
41 |
+
and.b32 %r79, %r78, 31;
|
42 |
+
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
|
43 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
|
44 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
|
45 |
+
shl.b32 %r80, %r78, 2;
|
46 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
|
47 |
+
and.b32 %r81, %r80, 252;
|
48 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
|
49 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
|
50 |
+
.loc 1 23 28
|
51 |
+
mov.u32 %r1, %ctaid.x;
|
52 |
+
.loc 1 30 40
|
53 |
+
shl.b32 %r82, %r1, 8;
|
54 |
+
.loc 1 30 36
|
55 |
+
or.b32 %r83, %r82, %r81;
|
56 |
+
.loc 1 30 30
|
57 |
+
mul.wide.s32 %rd17, %r83, 4;
|
58 |
+
add.s64 %rd1, %rd9, %rd17;
|
59 |
+
mov.b32 %r6, 0;
|
60 |
+
mov.pred %p1, -1;
|
61 |
+
.loc 1 30 46
|
62 |
+
mov.u32 %r2, 0x0;
|
63 |
+
mov.u32 %r3, 0x0;
|
64 |
+
mov.u32 %r4, 0x0;
|
65 |
+
mov.u32 %r5, 0x0;
|
66 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
67 |
+
@!%p1 mov.u32 %r2, %r6;
|
68 |
+
@!%p1 mov.u32 %r3, %r6;
|
69 |
+
@!%p1 mov.u32 %r4, %r6;
|
70 |
+
@!%p1 mov.u32 %r5, %r6;
|
71 |
+
mov.b32 %f1, %r2;
|
72 |
+
mov.b32 %f2, %r3;
|
73 |
+
mov.b32 %f3, %r4;
|
74 |
+
mov.b32 %f4, %r5;
|
75 |
+
.loc 1 31 30
|
76 |
+
mul.wide.s32 %rd18, %r83, 2;
|
77 |
+
add.s64 %rd2, %rd10, %rd18;
|
78 |
+
.loc 1 31 46
|
79 |
+
mov.u32 %r10, 0x0;
|
80 |
+
mov.u32 %r11, 0x0;
|
81 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
82 |
+
@!%p1 mov.u32 %r10, %r6;
|
83 |
+
@!%p1 mov.u32 %r11, %r6;
|
84 |
+
cvt.u16.u32 %rs1, %r10;
|
85 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
86 |
+
cvt.u16.u32 %rs3, %r11;
|
87 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
88 |
+
.loc 1 31 67
|
89 |
+
cvt.f32.bf16 %r14, %rs1;
|
90 |
+
mov.b32 %f5, %r14;
|
91 |
+
cvt.f32.bf16 %r15, %rs2;
|
92 |
+
mov.b32 %f6, %r15;
|
93 |
+
cvt.f32.bf16 %r16, %rs3;
|
94 |
+
mov.b32 %f7, %r16;
|
95 |
+
cvt.f32.bf16 %r17, %rs4;
|
96 |
+
mov.b32 %f8, %r17;
|
97 |
+
.loc 1 32 30
|
98 |
+
add.s64 %rd3, %rd11, %rd18;
|
99 |
+
.loc 1 32 46
|
100 |
+
mov.u32 %r18, 0x0;
|
101 |
+
mov.u32 %r19, 0x0;
|
102 |
+
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
|
103 |
+
@!%p1 mov.u32 %r18, %r6;
|
104 |
+
@!%p1 mov.u32 %r19, %r6;
|
105 |
+
cvt.u16.u32 %rs5, %r18;
|
106 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
|
107 |
+
cvt.u16.u32 %rs7, %r19;
|
108 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
|
109 |
+
.loc 1 32 67
|
110 |
+
cvt.f32.bf16 %r22, %rs5;
|
111 |
+
mov.b32 %f9, %r22;
|
112 |
+
cvt.f32.bf16 %r23, %rs6;
|
113 |
+
mov.b32 %f10, %r23;
|
114 |
+
cvt.f32.bf16 %r24, %rs7;
|
115 |
+
mov.b32 %f11, %r24;
|
116 |
+
cvt.f32.bf16 %r25, %rs8;
|
117 |
+
mov.b32 %f12, %r25;
|
118 |
+
.loc 1 33 30
|
119 |
+
add.s64 %rd4, %rd12, %rd18;
|
120 |
+
.loc 1 33 46
|
121 |
+
mov.u32 %r26, 0x0;
|
122 |
+
mov.u32 %r27, 0x0;
|
123 |
+
@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
|
124 |
+
@!%p1 mov.u32 %r26, %r6;
|
125 |
+
@!%p1 mov.u32 %r27, %r6;
|
126 |
+
cvt.u16.u32 %rs9, %r26;
|
127 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
|
128 |
+
cvt.u16.u32 %rs11, %r27;
|
129 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
|
130 |
+
.loc 1 33 67
|
131 |
+
cvt.f32.bf16 %r30, %rs9;
|
132 |
+
mov.b32 %f13, %r30;
|
133 |
+
cvt.f32.bf16 %r31, %rs10;
|
134 |
+
mov.b32 %f14, %r31;
|
135 |
+
cvt.f32.bf16 %r32, %rs11;
|
136 |
+
mov.b32 %f15, %r32;
|
137 |
+
cvt.f32.bf16 %r33, %rs12;
|
138 |
+
mov.b32 %f16, %r33;
|
139 |
+
.loc 1 34 31
|
140 |
+
add.s64 %rd5, %rd13, %rd18;
|
141 |
+
.loc 1 34 47
|
142 |
+
mov.u32 %r34, 0x0;
|
143 |
+
mov.u32 %r35, 0x0;
|
144 |
+
@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
|
145 |
+
@!%p1 mov.u32 %r34, %r6;
|
146 |
+
@!%p1 mov.u32 %r35, %r6;
|
147 |
+
cvt.u16.u32 %rs13, %r34;
|
148 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
|
149 |
+
cvt.u16.u32 %rs15, %r35;
|
150 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
|
151 |
+
.loc 1 34 68
|
152 |
+
cvt.f32.bf16 %r38, %rs13;
|
153 |
+
mov.b32 %f17, %r38;
|
154 |
+
cvt.f32.bf16 %r39, %rs14;
|
155 |
+
mov.b32 %f18, %r39;
|
156 |
+
cvt.f32.bf16 %r40, %rs15;
|
157 |
+
mov.b32 %f19, %r40;
|
158 |
+
cvt.f32.bf16 %r41, %rs16;
|
159 |
+
mov.b32 %f20, %r41;
|
160 |
+
.loc 1 35 31
|
161 |
+
mul.wide.u32 %rd19, %r81, 4;
|
162 |
+
add.s64 %rd6, %rd14, %rd19;
|
163 |
+
.loc 1 35 36
|
164 |
+
mov.u32 %r42, 0x0;
|
165 |
+
mov.u32 %r43, 0x0;
|
166 |
+
mov.u32 %r44, 0x0;
|
167 |
+
mov.u32 %r45, 0x0;
|
168 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
|
169 |
+
@!%p1 mov.u32 %r42, %r6;
|
170 |
+
@!%p1 mov.u32 %r43, %r6;
|
171 |
+
@!%p1 mov.u32 %r44, %r6;
|
172 |
+
@!%p1 mov.u32 %r45, %r6;
|
173 |
+
.loc 1 37 18
|
174 |
+
add.f32 %f21, %f5, %f1;
|
175 |
+
add.f32 %f22, %f6, %f2;
|
176 |
+
add.f32 %f23, %f7, %f3;
|
177 |
+
.loc 1 39 18
|
178 |
+
add.f32 %f24, %f21, %f9;
|
179 |
+
add.f32 %f25, %f22, %f10;
|
180 |
+
add.f32 %f26, %f23, %f11;
|
181 |
+
.loc 1 41 18
|
182 |
+
add.f32 %f27, %f25, %f14;
|
183 |
+
add.f32 %f28, %f26, %f15;
|
184 |
+
.loc 1 43 19
|
185 |
+
add.f32 %f29, %f27, %f18;
|
186 |
+
add.f32 %f30, %f28, %f19;
|
187 |
+
.loc 1 41 18
|
188 |
+
add.f32 %f31, %f24, %f13;
|
189 |
+
add.f32 %f32, %f8, %f4;
|
190 |
+
.loc 1 43 19
|
191 |
+
add.f32 %f33, %f32, %f12;
|
192 |
+
add.f32 %f34, %f31, %f17;
|
193 |
+
$L__tmp1:
|
194 |
+
.loc 2 233 15
|
195 |
+
add.f32 %f35, %f34, %f29;
|
196 |
+
add.f32 %f36, %f33, %f16;
|
197 |
+
add.f32 %f37, %f35, %f30;
|
198 |
+
add.f32 %f38, %f36, %f20;
|
199 |
+
mov.b32 %r71, %f38;
|
200 |
+
add.f32 %f39, %f37, %f38;
|
201 |
+
$L__tmp2:
|
202 |
+
.loc 2 243 36
|
203 |
+
mov.b32 %r84, %f39;
|
204 |
+
shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1;
|
205 |
+
mov.b32 %f40, %r85;
|
206 |
+
$L__tmp3:
|
207 |
+
.loc 2 233 15
|
208 |
+
add.f32 %f41, %f39, %f40;
|
209 |
+
$L__tmp4:
|
210 |
+
.loc 2 243 36
|
211 |
+
mov.b32 %r86, %f41;
|
212 |
+
shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1;
|
213 |
+
mov.b32 %f42, %r87;
|
214 |
+
$L__tmp5:
|
215 |
+
.loc 2 233 15
|
216 |
+
add.f32 %f43, %f41, %f42;
|
217 |
+
$L__tmp6:
|
218 |
+
.loc 2 243 36
|
219 |
+
mov.b32 %r88, %f43;
|
220 |
+
shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1;
|
221 |
+
mov.b32 %f44, %r89;
|
222 |
+
$L__tmp7:
|
223 |
+
.loc 2 233 15
|
224 |
+
add.f32 %f45, %f43, %f44;
|
225 |
+
$L__tmp8:
|
226 |
+
.loc 2 243 36
|
227 |
+
mov.b32 %r90, %f45;
|
228 |
+
shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1;
|
229 |
+
mov.b32 %f46, %r91;
|
230 |
+
$L__tmp9:
|
231 |
+
.loc 2 233 15
|
232 |
+
add.f32 %f47, %f45, %f46;
|
233 |
+
$L__tmp10:
|
234 |
+
.loc 2 243 36
|
235 |
+
mov.b32 %r92, %f47;
|
236 |
+
shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1;
|
237 |
+
mov.b32 %f48, %r93;
|
238 |
+
$L__tmp11:
|
239 |
+
.loc 2 233 15
|
240 |
+
add.f32 %f49, %f47, %f48;
|
241 |
+
$L__tmp12:
|
242 |
+
.loc 2 243 36
|
243 |
+
setp.eq.s32 %p23, %r79, 0;
|
244 |
+
shr.u32 %r94, %r78, 3;
|
245 |
+
and.b32 %r95, %r94, 4;
|
246 |
+
mov.u32 %r96, global_smem;
|
247 |
+
add.s32 %r50, %r96, %r95;
|
248 |
+
mov.b32 %r51, %f49;
|
249 |
+
@%p23 st.shared.b32 [ %r50 + 0 ], %r51;
|
250 |
+
bar.sync 0;
|
251 |
+
setp.lt.s32 %p24, %r78, 2;
|
252 |
+
add.s32 %r53, %r96, %r80;
|
253 |
+
@%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
|
254 |
+
mov.b32 %f50, %r52;
|
255 |
+
shfl.sync.bfly.b32 %r97, %r52, 1, 31, -1;
|
256 |
+
mov.b32 %f51, %r97;
|
257 |
+
$L__tmp13:
|
258 |
+
.loc 2 233 15
|
259 |
+
add.f32 %f52, %f50, %f51;
|
260 |
+
$L__tmp14:
|
261 |
+
.loc 2 243 36
|
262 |
+
and.b32 %r98, %r78, 1;
|
263 |
+
setp.eq.b32 %p31, %r98, 1;
|
264 |
+
not.pred %p32, %p31;
|
265 |
+
and.pred %p25, %p24, %p32;
|
266 |
+
mov.b32 %r55, %f52;
|
267 |
+
@%p25 st.shared.b32 [ %r53 + 0 ], %r55;
|
268 |
+
bar.sync 0;
|
269 |
+
ld.shared.f32 %f53, [global_smem];
|
270 |
+
$L__tmp15:
|
271 |
+
.loc 3 8 15
|
272 |
+
add.f32 %f54, %f53, 0f00000000;
|
273 |
+
$L__tmp16:
|
274 |
+
.loc 1 51 20
|
275 |
+
mov.b32 %r57, %f54;
|
276 |
+
mov.b32 %r58, 1132462080;
|
277 |
+
div.full.f32 %r56, %r57, %r58;
|
278 |
+
mov.b32 %f55, %r56;
|
279 |
+
.loc 1 52 20
|
280 |
+
sub.f32 %f56, %f34, %f55;
|
281 |
+
sub.f32 %f57, %f29, %f55;
|
282 |
+
sub.f32 %f58, %f30, %f55;
|
283 |
+
sub.f32 %f59, %f38, %f55;
|
284 |
+
.loc 1 53 20
|
285 |
+
mul.f32 %f60, %f57, %f57;
|
286 |
+
$L__tmp17:
|
287 |
+
.loc 2 243 36
|
288 |
+
bar.sync 0;
|
289 |
+
$L__tmp18:
|
290 |
+
.loc 2 233 15
|
291 |
+
fma.rn.f32 %f61, %f56, %f56, %f60;
|
292 |
+
fma.rn.f32 %f62, %f58, %f58, %f61;
|
293 |
+
fma.rn.f32 %f63, %f59, %f59, %f62;
|
294 |
+
$L__tmp19:
|
295 |
+
.loc 2 243 36
|
296 |
+
mov.b32 %r99, %f63;
|
297 |
+
shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1;
|
298 |
+
mov.b32 %f64, %r100;
|
299 |
+
$L__tmp20:
|
300 |
+
.loc 2 233 15
|
301 |
+
add.f32 %f65, %f63, %f64;
|
302 |
+
$L__tmp21:
|
303 |
+
.loc 2 243 36
|
304 |
+
mov.b32 %r101, %f65;
|
305 |
+
shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1;
|
306 |
+
mov.b32 %f66, %r102;
|
307 |
+
$L__tmp22:
|
308 |
+
.loc 2 233 15
|
309 |
+
add.f32 %f67, %f65, %f66;
|
310 |
+
$L__tmp23:
|
311 |
+
.loc 2 243 36
|
312 |
+
mov.b32 %r103, %f67;
|
313 |
+
shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1;
|
314 |
+
mov.b32 %f68, %r104;
|
315 |
+
$L__tmp24:
|
316 |
+
.loc 2 233 15
|
317 |
+
add.f32 %f69, %f67, %f68;
|
318 |
+
$L__tmp25:
|
319 |
+
.loc 2 243 36
|
320 |
+
mov.b32 %r105, %f69;
|
321 |
+
shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1;
|
322 |
+
mov.b32 %f70, %r106;
|
323 |
+
$L__tmp26:
|
324 |
+
.loc 2 233 15
|
325 |
+
add.f32 %f71, %f69, %f70;
|
326 |
+
$L__tmp27:
|
327 |
+
.loc 2 243 36
|
328 |
+
mov.b32 %r107, %f71;
|
329 |
+
shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1;
|
330 |
+
mov.b32 %f72, %r108;
|
331 |
+
$L__tmp28:
|
332 |
+
.loc 2 233 15
|
333 |
+
add.f32 %f73, %f71, %f72;
|
334 |
+
$L__tmp29:
|
335 |
+
.loc 2 243 36
|
336 |
+
mov.b32 %r60, %f73;
|
337 |
+
@%p23 st.shared.b32 [ %r50 + 0 ], %r60;
|
338 |
+
bar.sync 0;
|
339 |
+
@%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
|
340 |
+
mov.b32 %f74, %r61;
|
341 |
+
shfl.sync.bfly.b32 %r109, %r61, 1, 31, -1;
|
342 |
+
mov.b32 %f75, %r109;
|
343 |
+
$L__tmp30:
|
344 |
+
.loc 2 233 15
|
345 |
+
add.f32 %f76, %f74, %f75;
|
346 |
+
$L__tmp31:
|
347 |
+
.loc 2 243 36
|
348 |
+
mov.b32 %r64, %f76;
|
349 |
+
@%p25 st.shared.b32 [ %r53 + 0 ], %r64;
|
350 |
+
bar.sync 0;
|
351 |
+
ld.shared.f32 %f77, [global_smem];
|
352 |
+
$L__tmp32:
|
353 |
+
.loc 3 8 15
|
354 |
+
add.f32 %f78, %f77, 0f00000000;
|
355 |
+
$L__tmp33:
|
356 |
+
.loc 1 59 20
|
357 |
+
mov.b32 %r66, %f78;
|
358 |
+
div.full.f32 %r65, %r66, %r58;
|
359 |
+
mov.b32 %f79, %r65;
|
360 |
+
.loc 1 61 20
|
361 |
+
add.f32 %f80, %f79, 0f3727C5AC;
|
362 |
+
.loc 1 62 26
|
363 |
+
rsqrt.approx.ftz.f32 %f81, %f80;
|
364 |
+
.loc 1 35 36
|
365 |
+
mov.b32 %f82, %r45;
|
366 |
+
mov.b32 %f83, %r44;
|
367 |
+
mov.b32 %f84, %r43;
|
368 |
+
mov.b32 %f85, %r42;
|
369 |
+
.loc 1 63 20
|
370 |
+
mul.f32 %f86, %f56, %f81;
|
371 |
+
mul.f32 %f87, %f57, %f81;
|
372 |
+
mul.f32 %f88, %f58, %f81;
|
373 |
+
mul.f32 %f89, %f59, %f81;
|
374 |
+
.loc 1 64 20
|
375 |
+
mul.f32 %f90, %f86, %f85;
|
376 |
+
mul.f32 %f91, %f87, %f84;
|
377 |
+
mul.f32 %f92, %f88, %f83;
|
378 |
+
mul.f32 %f93, %f89, %f82;
|
379 |
+
.loc 1 66 25
|
380 |
+
add.s64 %rd7, %rd15, %rd17;
|
381 |
+
.loc 1 66 48
|
382 |
+
mov.b32 %r68, %f34;
|
383 |
+
mov.b32 %r69, %f29;
|
384 |
+
mov.b32 %r70, %f30;
|
385 |
+
@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
|
386 |
+
.loc 1 67 25
|
387 |
+
add.s64 %rd8, %rd16, %rd18;
|
388 |
+
.loc 1 67 48
|
389 |
+
mov.b32 %r72, %f90;
|
390 |
+
cvt.rn.bf16.f32 %rs17, %r72;
|
391 |
+
mov.b32 %r73, %f91;
|
392 |
+
cvt.rn.bf16.f32 %rs18, %r73;
|
393 |
+
mov.b32 %r74, %f92;
|
394 |
+
cvt.rn.bf16.f32 %rs19, %r74;
|
395 |
+
mov.b32 %r75, %f93;
|
396 |
+
cvt.rn.bf16.f32 %rs20, %r75;
|
397 |
+
mov.b32 %r110, {%rs17, %rs18};
|
398 |
+
mov.b32 %r111, {%rs19, %rs20};
|
399 |
+
@%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 };
|
400 |
+
.loc 1 67 4
|
401 |
+
ret;
|
402 |
+
$L__tmp34:
|
403 |
+
$L__func_end0:
|
404 |
+
|
405 |
+
}
|
406 |
+
// .globl __nv_rsqrtf
|
407 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
408 |
+
.param .b32 __nv_rsqrtf_param_0
|
409 |
+
)
|
410 |
+
{
|
411 |
+
.reg .f32 %f<3>;
|
412 |
+
$L__func_begin1:
|
413 |
+
|
414 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
415 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
416 |
+
st.param.f32 [func_retval0+0], %f2;
|
417 |
+
ret;
|
418 |
+
$L__func_end1:
|
419 |
+
|
420 |
+
}
|
421 |
+
.file 1 "/tmp/torchinductor_root/yb/cybxnh26qvsbmxmvdr54vaav2ezk2qxu7562fhhsn4lvyvqgoglw.py"
|
422 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
423 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
424 |
+
.section .debug_abbrev
|
425 |
+
{
|
426 |
+
.b8 1
|
427 |
+
.b8 17
|
428 |
+
.b8 1
|
429 |
+
.b8 37
|
430 |
+
.b8 8
|
431 |
+
.b8 19
|
432 |
+
.b8 5
|
433 |
+
.b8 3
|
434 |
+
.b8 8
|
435 |
+
.b8 16
|
436 |
+
.b8 6
|
437 |
+
.b8 27
|
438 |
+
.b8 8
|
439 |
+
.b8 180
|
440 |
+
.b8 66
|
441 |
+
.b8 12
|
442 |
+
.b8 17
|
443 |
+
.b8 1
|
444 |
+
.b8 18
|
445 |
+
.b8 1
|
446 |
+
.b8 0
|
447 |
+
.b8 0
|
448 |
+
.b8 2
|
449 |
+
.b8 46
|
450 |
+
.b8 0
|
451 |
+
.b8 135
|
452 |
+
.b8 64
|
453 |
+
.b8 8
|
454 |
+
.b8 3
|
455 |
+
.b8 8
|
456 |
+
.b8 58
|
457 |
+
.b8 11
|
458 |
+
.b8 59
|
459 |
+
.b8 11
|
460 |
+
.b8 63
|
461 |
+
.b8 12
|
462 |
+
.b8 32
|
463 |
+
.b8 11
|
464 |
+
.b8 0
|
465 |
+
.b8 0
|
466 |
+
.b8 3
|
467 |
+
.b8 46
|
468 |
+
.b8 1
|
469 |
+
.b8 17
|
470 |
+
.b8 1
|
471 |
+
.b8 18
|
472 |
+
.b8 1
|
473 |
+
.b8 64
|
474 |
+
.b8 10
|
475 |
+
.b8 49
|
476 |
+
.b8 19
|
477 |
+
.b8 0
|
478 |
+
.b8 0
|
479 |
+
.b8 4
|
480 |
+
.b8 29
|
481 |
+
.b8 1
|
482 |
+
.b8 49
|
483 |
+
.b8 19
|
484 |
+
.b8 17
|
485 |
+
.b8 1
|
486 |
+
.b8 18
|
487 |
+
.b8 1
|
488 |
+
.b8 88
|
489 |
+
.b8 11
|
490 |
+
.b8 89
|
491 |
+
.b8 11
|
492 |
+
.b8 87
|
493 |
+
.b8 11
|
494 |
+
.b8 0
|
495 |
+
.b8 0
|
496 |
+
.b8 5
|
497 |
+
.b8 29
|
498 |
+
.b8 0
|
499 |
+
.b8 49
|
500 |
+
.b8 19
|
501 |
+
.b8 17
|
502 |
+
.b8 1
|
503 |
+
.b8 18
|
504 |
+
.b8 1
|
505 |
+
.b8 88
|
506 |
+
.b8 11
|
507 |
+
.b8 89
|
508 |
+
.b8 11
|
509 |
+
.b8 87
|
510 |
+
.b8 11
|
511 |
+
.b8 0
|
512 |
+
.b8 0
|
513 |
+
.b8 0
|
514 |
+
}
|
515 |
+
.section .debug_info
|
516 |
+
{
|
517 |
+
.b32 407
|
518 |
+
.b8 2
|
519 |
+
.b8 0
|
520 |
+
.b32 .debug_abbrev
|
521 |
+
.b8 8
|
522 |
+
.b8 1
|
523 |
+
.b8 116
|
524 |
+
.b8 114
|
525 |
+
.b8 105
|
526 |
+
.b8 116
|
527 |
+
.b8 111
|
528 |
+
.b8 110
|
529 |
+
.b8 0
|
530 |
+
.b8 2
|
531 |
+
.b8 0
|
532 |
+
.b8 99
|
533 |
+
.b8 121
|
534 |
+
.b8 98
|
535 |
+
.b8 120
|
536 |
+
.b8 110
|
537 |
+
.b8 104
|
538 |
+
.b8 50
|
539 |
+
.b8 54
|
540 |
+
.b8 113
|
541 |
+
.b8 118
|
542 |
+
.b8 115
|
543 |
+
.b8 98
|
544 |
+
.b8 109
|
545 |
+
.b8 120
|
546 |
+
.b8 109
|
547 |
+
.b8 118
|
548 |
+
.b8 100
|
549 |
+
.b8 114
|
550 |
+
.b8 53
|
551 |
+
.b8 52
|
552 |
+
.b8 118
|
553 |
+
.b8 97
|
554 |
+
.b8 97
|
555 |
+
.b8 118
|
556 |
+
.b8 50
|
557 |
+
.b8 101
|
558 |
+
.b8 122
|
559 |
+
.b8 107
|
560 |
+
.b8 50
|
561 |
+
.b8 113
|
562 |
+
.b8 120
|
563 |
+
.b8 117
|
564 |
+
.b8 55
|
565 |
+
.b8 53
|
566 |
+
.b8 54
|
567 |
+
.b8 50
|
568 |
+
.b8 102
|
569 |
+
.b8 104
|
570 |
+
.b8 104
|
571 |
+
.b8 115
|
572 |
+
.b8 110
|
573 |
+
.b8 52
|
574 |
+
.b8 108
|
575 |
+
.b8 118
|
576 |
+
.b8 121
|
577 |
+
.b8 118
|
578 |
+
.b8 113
|
579 |
+
.b8 103
|
580 |
+
.b8 111
|
581 |
+
.b8 103
|
582 |
+
.b8 108
|
583 |
+
.b8 119
|
584 |
+
.b8 46
|
585 |
+
.b8 112
|
586 |
+
.b8 121
|
587 |
+
.b8 0
|
588 |
+
.b32 .debug_line
|
589 |
+
.b8 47
|
590 |
+
.b8 116
|
591 |
+
.b8 109
|
592 |
+
.b8 112
|
593 |
+
.b8 47
|
594 |
+
.b8 116
|
595 |
+
.b8 111
|
596 |
+
.b8 114
|
597 |
+
.b8 99
|
598 |
+
.b8 104
|
599 |
+
.b8 105
|
600 |
+
.b8 110
|
601 |
+
.b8 100
|
602 |
+
.b8 117
|
603 |
+
.b8 99
|
604 |
+
.b8 116
|
605 |
+
.b8 111
|
606 |
+
.b8 114
|
607 |
+
.b8 95
|
608 |
+
.b8 114
|
609 |
+
.b8 111
|
610 |
+
.b8 111
|
611 |
+
.b8 116
|
612 |
+
.b8 47
|
613 |
+
.b8 121
|
614 |
+
.b8 98
|
615 |
+
.b8 0
|
616 |
+
.b8 1
|
617 |
+
.b64 $L__func_begin0
|
618 |
+
.b64 $L__func_end0
|
619 |
+
.b8 2
|
620 |
+
.b8 116
|
621 |
+
.b8 114
|
622 |
+
.b8 105
|
623 |
+
.b8 116
|
624 |
+
.b8 111
|
625 |
+
.b8 110
|
626 |
+
.b8 95
|
627 |
+
.b8 95
|
628 |
+
.b8 48
|
629 |
+
.b8 100
|
630 |
+
.b8 49
|
631 |
+
.b8 100
|
632 |
+
.b8 50
|
633 |
+
.b8 100
|
634 |
+
.b8 51
|
635 |
+
.b8 100
|
636 |
+
.b8 52
|
637 |
+
.b8 100
|
638 |
+
.b8 53
|
639 |
+
.b8 100
|
640 |
+
.b8 54
|
641 |
+
.b8 100
|
642 |
+
.b8 55
|
643 |
+
.b8 100
|
644 |
+
.b8 56
|
645 |
+
.b8 100
|
646 |
+
.b8 101
|
647 |
+
.b8 57
|
648 |
+
.b8 100
|
649 |
+
.b8 101
|
650 |
+
.b8 0
|
651 |
+
.b8 116
|
652 |
+
.b8 114
|
653 |
+
.b8 105
|
654 |
+
.b8 116
|
655 |
+
.b8 111
|
656 |
+
.b8 110
|
657 |
+
.b8 95
|
658 |
+
.b8 95
|
659 |
+
.b8 48
|
660 |
+
.b8 100
|
661 |
+
.b8 49
|
662 |
+
.b8 100
|
663 |
+
.b8 50
|
664 |
+
.b8 100
|
665 |
+
.b8 51
|
666 |
+
.b8 100
|
667 |
+
.b8 52
|
668 |
+
.b8 100
|
669 |
+
.b8 53
|
670 |
+
.b8 100
|
671 |
+
.b8 54
|
672 |
+
.b8 100
|
673 |
+
.b8 55
|
674 |
+
.b8 100
|
675 |
+
.b8 56
|
676 |
+
.b8 100
|
677 |
+
.b8 101
|
678 |
+
.b8 57
|
679 |
+
.b8 100
|
680 |
+
.b8 101
|
681 |
+
.b8 0
|
682 |
+
.b8 1
|
683 |
+
.b8 18
|
684 |
+
.b8 1
|
685 |
+
.b8 1
|
686 |
+
.b8 3
|
687 |
+
.b64 $L__func_begin0
|
688 |
+
.b64 $L__func_end0
|
689 |
+
.b8 1
|
690 |
+
.b8 156
|
691 |
+
.b32 125
|
692 |
+
.b8 4
|
693 |
+
.b32 125
|
694 |
+
.b64 $L__tmp1
|
695 |
+
.b64 $L__tmp14
|
696 |
+
.b8 2
|
697 |
+
.b8 48
|
698 |
+
.b8 59
|
699 |
+
.b8 5
|
700 |
+
.b32 125
|
701 |
+
.b64 $L__tmp1
|
702 |
+
.b64 $L__tmp14
|
703 |
+
.b8 2
|
704 |
+
.b8 243
|
705 |
+
.b8 36
|
706 |
+
.b8 0
|
707 |
+
.b8 5
|
708 |
+
.b32 125
|
709 |
+
.b64 $L__tmp2
|
710 |
+
.b64 $L__tmp15
|
711 |
+
.b8 2
|
712 |
+
.b8 48
|
713 |
+
.b8 59
|
714 |
+
.b8 5
|
715 |
+
.b32 125
|
716 |
+
.b64 $L__tmp15
|
717 |
+
.b64 $L__tmp16
|
718 |
+
.b8 3
|
719 |
+
.b8 48
|
720 |
+
.b8 45
|
721 |
+
.b8 5
|
722 |
+
.b32 125
|
723 |
+
.b64 $L__tmp17
|
724 |
+
.b64 $L__tmp32
|
725 |
+
.b8 2
|
726 |
+
.b8 56
|
727 |
+
.b8 59
|
728 |
+
.b8 4
|
729 |
+
.b32 125
|
730 |
+
.b64 $L__tmp18
|
731 |
+
.b64 $L__tmp31
|
732 |
+
.b8 2
|
733 |
+
.b8 56
|
734 |
+
.b8 59
|
735 |
+
.b8 5
|
736 |
+
.b32 125
|
737 |
+
.b64 $L__tmp18
|
738 |
+
.b64 $L__tmp31
|
739 |
+
.b8 2
|
740 |
+
.b8 243
|
741 |
+
.b8 36
|
742 |
+
.b8 0
|
743 |
+
.b8 5
|
744 |
+
.b32 125
|
745 |
+
.b64 $L__tmp32
|
746 |
+
.b64 $L__tmp33
|
747 |
+
.b8 3
|
748 |
+
.b8 56
|
749 |
+
.b8 45
|
750 |
+
.b8 0
|
751 |
+
.b8 0
|
752 |
+
}
|
753 |
+
.section .debug_pubnames
|
754 |
+
{
|
755 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
756 |
+
$L__pubNames_start0:
|
757 |
+
.b8 2
|
758 |
+
.b8 0
|
759 |
+
.b32 .debug_info
|
760 |
+
.b32 411
|
761 |
+
.b32 125
|
762 |
+
.b8 116
|
763 |
+
.b8 114
|
764 |
+
.b8 105
|
765 |
+
.b8 116
|
766 |
+
.b8 111
|
767 |
+
.b8 110
|
768 |
+
.b8 95
|
769 |
+
.b8 95
|
770 |
+
.b8 48
|
771 |
+
.b8 100
|
772 |
+
.b8 49
|
773 |
+
.b8 100
|
774 |
+
.b8 50
|
775 |
+
.b8 100
|
776 |
+
.b8 51
|
777 |
+
.b8 100
|
778 |
+
.b8 52
|
779 |
+
.b8 100
|
780 |
+
.b8 53
|
781 |
+
.b8 100
|
782 |
+
.b8 54
|
783 |
+
.b8 100
|
784 |
+
.b8 55
|
785 |
+
.b8 100
|
786 |
+
.b8 56
|
787 |
+
.b8 100
|
788 |
+
.b8 101
|
789 |
+
.b8 57
|
790 |
+
.b8 100
|
791 |
+
.b8 101
|
792 |
+
.b8 0
|
793 |
+
.b32 0
|
794 |
+
$L__pubNames_end0:
|
795 |
+
}
|
796 |
+
.section .debug_pubtypes
|
797 |
+
{
|
798 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
799 |
+
$L__pubTypes_start0:
|
800 |
+
.b8 2
|
801 |
+
.b8 0
|
802 |
+
.b32 .debug_info
|
803 |
+
.b32 411
|
804 |
+
.b32 0
|
805 |
+
$L__pubTypes_end0:
|
806 |
+
}
|
807 |
+
.section .debug_loc { }
|
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
8 |
+
%c256_i32 = arith.constant 256 : i32
|
9 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
20 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
21 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
22 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
23 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
27 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
28 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
29 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
30 |
+
%19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
31 |
+
%20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
32 |
+
%21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
33 |
+
%22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
34 |
+
%23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
35 |
+
%24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
36 |
+
%25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
37 |
+
%26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
38 |
+
%27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
39 |
+
%28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
40 |
+
%29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
|
41 |
+
%30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
|
42 |
+
%31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
|
43 |
+
%32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
44 |
+
%33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
|
45 |
+
^bb0(%arg10: f32, %arg11: f32):
|
46 |
+
%53 = arith.addf %arg10, %arg11 : f32
|
47 |
+
tt.reduce.return %53 : f32
|
48 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
49 |
+
%34 = arith.addf %33, %cst_2 : f32
|
50 |
+
%35 = arith.divf %34, %cst_1 : f32
|
51 |
+
%36 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
|
52 |
+
%37 = arith.subf %31, %36 : tensor<256xf32, #blocked>
|
53 |
+
%38 = arith.mulf %37, %37 : tensor<256xf32, #blocked>
|
54 |
+
%39 = arith.select %2, %38, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
55 |
+
%40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
|
56 |
+
^bb0(%arg10: f32, %arg11: f32):
|
57 |
+
%53 = arith.addf %arg10, %arg11 : f32
|
58 |
+
tt.reduce.return %53 : f32
|
59 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
60 |
+
%41 = arith.addf %40, %cst_2 : f32
|
61 |
+
%42 = arith.divf %41, %cst_1 : f32
|
62 |
+
%43 = arith.addf %42, %cst_0 : f32
|
63 |
+
%44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
64 |
+
%45 = tt.splat %44 : (f32) -> tensor<256xf32, #blocked>
|
65 |
+
%46 = arith.mulf %37, %45 : tensor<256xf32, #blocked>
|
66 |
+
%47 = arith.mulf %46, %27 : tensor<256xf32, #blocked>
|
67 |
+
%48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
68 |
+
%49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
69 |
+
tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
70 |
+
%50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
71 |
+
%51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
72 |
+
%52 = arith.truncf %47 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
73 |
+
tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
74 |
+
tt.return
|
75 |
+
}
|
76 |
+
}
|
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
24 |
+
%14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
26 |
+
%16 = arith.addf %8, %12 : tensor<256xf32>
|
27 |
+
%17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
28 |
+
%18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
|
29 |
+
^bb0(%arg6: f32, %arg7: f32):
|
30 |
+
%36 = arith.addf %arg6, %arg7 : f32
|
31 |
+
tt.reduce.return %36 : f32
|
32 |
+
}) : (tensor<256xf32>) -> f32
|
33 |
+
%19 = arith.addf %18, %cst_0 : f32
|
34 |
+
%20 = arith.divf %19, %cst_1 : f32
|
35 |
+
%21 = tt.splat %20 : (f32) -> tensor<256xf32>
|
36 |
+
%22 = arith.subf %16, %21 : tensor<256xf32>
|
37 |
+
%23 = arith.mulf %22, %22 : tensor<256xf32>
|
38 |
+
%24 = arith.select %2, %23, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
39 |
+
%25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({
|
40 |
+
^bb0(%arg6: f32, %arg7: f32):
|
41 |
+
%36 = arith.addf %arg6, %arg7 : f32
|
42 |
+
tt.reduce.return %36 : f32
|
43 |
+
}) : (tensor<256xf32>) -> f32
|
44 |
+
%26 = arith.addf %25, %cst_0 : f32
|
45 |
+
%27 = arith.divf %26, %cst_1 : f32
|
46 |
+
%28 = arith.addf %27, %cst_2 : f32
|
47 |
+
%29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
48 |
+
%30 = tt.splat %29 : (f32) -> tensor<256xf32>
|
49 |
+
%31 = arith.mulf %22, %30 : tensor<256xf32>
|
50 |
+
%32 = arith.mulf %31, %15 : tensor<256xf32>
|
51 |
+
%33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
52 |
+
%34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
53 |
+
%35 = arith.truncf %32 : tensor<256xf32> to tensor<256xbf16>
|
54 |
+
tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
55 |
+
tt.return
|
56 |
+
}
|
57 |
+
}
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin
ADDED
Binary file (23.9 kB). View file
|
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx
ADDED
@@ -0,0 +1,764 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1de(
|
13 |
+
.param .u64 triton__0d1de_param_0,
|
14 |
+
.param .u32 triton__0d1de_param_1
|
15 |
+
)
|
16 |
+
.maxntid 128, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<27>;
|
19 |
+
.reg .b16 %rs<17>;
|
20 |
+
.reg .b32 %r<67>;
|
21 |
+
.reg .f32 %f<431>;
|
22 |
+
.reg .b64 %rd<6>;
|
23 |
+
.loc 1 18 0
|
24 |
+
$L__func_begin0:
|
25 |
+
.loc 1 18 0
|
26 |
+
|
27 |
+
ld.param.u64 %rd3, [triton__0d1de_param_0];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r14, %tid.x;
|
31 |
+
shl.b32 %r15, %r14, 3;
|
32 |
+
and.b32 %r16, %r15, 1016;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r17, %r1, 10;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r18, %r17, %r16;
|
39 |
+
.loc 1 24 34
|
40 |
+
mul.wide.s32 %rd4, %r18, 2;
|
41 |
+
add.s64 %rd5, %rd3, %rd4;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 39
|
44 |
+
mov.u32 %r2, 0x0;
|
45 |
+
mov.u32 %r3, 0x0;
|
46 |
+
mov.u32 %r4, 0x0;
|
47 |
+
mov.u32 %r5, 0x0;
|
48 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd5 + 0 ];
|
49 |
+
cvt.u16.u32 %rs1, %r2;
|
50 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
51 |
+
cvt.u16.u32 %rs3, %r3;
|
52 |
+
.loc 1 24 48
|
53 |
+
cvt.f32.bf16 %r6, %rs1;
|
54 |
+
mov.b32 %f1, %r6;
|
55 |
+
cvt.f32.bf16 %r7, %rs2;
|
56 |
+
mov.b32 %f2, %r7;
|
57 |
+
.loc 1 29 18
|
58 |
+
mul.f32 %f9, %f1, 0f3F3504F3;
|
59 |
+
.loc 1 30 23
|
60 |
+
abs.ftz.f32 %f17, %f9;
|
61 |
+
setp.ge.f32 %p2, %f17, 0f3F8060FE;
|
62 |
+
mov.f32 %f365, 0f3789CA3C;
|
63 |
+
mov.f32 %f364, 0fB9F560B9;
|
64 |
+
mov.f32 %f363, 0f3BAC840B;
|
65 |
+
mov.f32 %f362, 0fBD0C8162;
|
66 |
+
mov.f32 %f361, 0f3E1CF906;
|
67 |
+
mov.f32 %f360, 0f3F6A937E;
|
68 |
+
mov.f32 %f359, 0f3F20D842;
|
69 |
+
mov.f32 %f366, %f17;
|
70 |
+
@%p2 bra $L__BB0_2;
|
71 |
+
.loc 1 0 23
|
72 |
+
mov.f32 %f365, 0f38B1E96A;
|
73 |
+
mov.f32 %f364, 0fBA574D20;
|
74 |
+
mov.f32 %f363, 0f3BAAD5EA;
|
75 |
+
mov.f32 %f362, 0fBCDC1BE7;
|
76 |
+
mov.f32 %f361, 0f3DE718AF;
|
77 |
+
mov.f32 %f360, 0fBEC093AC;
|
78 |
+
mov.f32 %f359, 0f3E0375D3;
|
79 |
+
.loc 1 30 23
|
80 |
+
mul.f32 %f366, %f9, %f9;
|
81 |
+
$L__BB0_2:
|
82 |
+
.loc 1 0 0
|
83 |
+
cvt.f32.bf16 %r8, %rs3;
|
84 |
+
mul.f32 %f10, %f2, 0f3F3504F3;
|
85 |
+
.loc 1 30 23
|
86 |
+
setp.ltu.f32 %p3, %f17, 0f3F8060FE;
|
87 |
+
fma.rn.ftz.f32 %f135, %f365, %f366, %f364;
|
88 |
+
fma.rn.ftz.f32 %f136, %f135, %f366, %f363;
|
89 |
+
fma.rn.ftz.f32 %f137, %f136, %f366, %f362;
|
90 |
+
fma.rn.ftz.f32 %f138, %f137, %f366, %f361;
|
91 |
+
fma.rn.ftz.f32 %f139, %f138, %f366, %f360;
|
92 |
+
fma.rn.ftz.f32 %f140, %f139, %f366, %f359;
|
93 |
+
neg.f32 %f141, %f366;
|
94 |
+
selp.f32 %f142, %f141, %f9, %p2;
|
95 |
+
fma.rn.ftz.f32 %f367, %f140, %f142, %f142;
|
96 |
+
mov.f32 %f358, 0f3F800000;
|
97 |
+
@%p3 bra $L__BB0_4;
|
98 |
+
ex2.approx.ftz.f32 %f143, %f367;
|
99 |
+
sub.f32 %f145, %f358, %f143;
|
100 |
+
mov.b32 %r19, %f145;
|
101 |
+
mov.b32 %r20, %f9;
|
102 |
+
and.b32 %r21, %r20, -2147483648;
|
103 |
+
or.b32 %r22, %r21, %r19;
|
104 |
+
mov.b32 %f367, %r22;
|
105 |
+
$L__BB0_4:
|
106 |
+
.loc 1 0 0
|
107 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
108 |
+
mov.b32 %f3, %r8;
|
109 |
+
.loc 1 30 23
|
110 |
+
abs.ftz.f32 %f30, %f10;
|
111 |
+
setp.ge.f32 %p5, %f30, 0f3F8060FE;
|
112 |
+
mov.f32 %f374, 0f3789CA3C;
|
113 |
+
mov.f32 %f373, 0fB9F560B9;
|
114 |
+
mov.f32 %f372, 0f3BAC840B;
|
115 |
+
mov.f32 %f371, 0fBD0C8162;
|
116 |
+
mov.f32 %f370, 0f3E1CF906;
|
117 |
+
mov.f32 %f369, 0f3F6A937E;
|
118 |
+
mov.f32 %f368, 0f3F20D842;
|
119 |
+
mov.f32 %f375, %f30;
|
120 |
+
@%p5 bra $L__BB0_6;
|
121 |
+
mul.f32 %f375, %f10, %f10;
|
122 |
+
mov.f32 %f374, 0f38B1E96A;
|
123 |
+
mov.f32 %f373, 0fBA574D20;
|
124 |
+
mov.f32 %f372, 0f3BAAD5EA;
|
125 |
+
mov.f32 %f371, 0fBCDC1BE7;
|
126 |
+
mov.f32 %f370, 0f3DE718AF;
|
127 |
+
mov.f32 %f369, 0fBEC093AC;
|
128 |
+
mov.f32 %f368, 0f3E0375D3;
|
129 |
+
$L__BB0_6:
|
130 |
+
.loc 1 0 0
|
131 |
+
cvt.f32.bf16 %r9, %rs4;
|
132 |
+
mul.f32 %f11, %f3, 0f3F3504F3;
|
133 |
+
.loc 1 30 23
|
134 |
+
setp.ltu.f32 %p6, %f30, 0f3F8060FE;
|
135 |
+
fma.rn.ftz.f32 %f160, %f374, %f375, %f373;
|
136 |
+
fma.rn.ftz.f32 %f161, %f160, %f375, %f372;
|
137 |
+
fma.rn.ftz.f32 %f162, %f161, %f375, %f371;
|
138 |
+
fma.rn.ftz.f32 %f163, %f162, %f375, %f370;
|
139 |
+
fma.rn.ftz.f32 %f164, %f163, %f375, %f369;
|
140 |
+
fma.rn.ftz.f32 %f165, %f164, %f375, %f368;
|
141 |
+
neg.f32 %f166, %f375;
|
142 |
+
selp.f32 %f167, %f166, %f10, %p5;
|
143 |
+
fma.rn.ftz.f32 %f376, %f165, %f167, %f167;
|
144 |
+
@%p6 bra $L__BB0_8;
|
145 |
+
ex2.approx.ftz.f32 %f168, %f376;
|
146 |
+
sub.f32 %f170, %f358, %f168;
|
147 |
+
mov.b32 %r23, %f170;
|
148 |
+
mov.b32 %r24, %f10;
|
149 |
+
and.b32 %r25, %r24, -2147483648;
|
150 |
+
or.b32 %r26, %r25, %r23;
|
151 |
+
mov.b32 %f376, %r26;
|
152 |
+
$L__BB0_8:
|
153 |
+
.loc 1 0 0
|
154 |
+
cvt.u16.u32 %rs5, %r4;
|
155 |
+
mov.b32 %f4, %r9;
|
156 |
+
.loc 1 30 23
|
157 |
+
abs.ftz.f32 %f43, %f11;
|
158 |
+
setp.ge.f32 %p8, %f43, 0f3F8060FE;
|
159 |
+
mov.f32 %f383, 0f3789CA3C;
|
160 |
+
mov.f32 %f382, 0fB9F560B9;
|
161 |
+
mov.f32 %f381, 0f3BAC840B;
|
162 |
+
mov.f32 %f380, 0fBD0C8162;
|
163 |
+
mov.f32 %f379, 0f3E1CF906;
|
164 |
+
mov.f32 %f378, 0f3F6A937E;
|
165 |
+
mov.f32 %f377, 0f3F20D842;
|
166 |
+
mov.f32 %f384, %f43;
|
167 |
+
@%p8 bra $L__BB0_10;
|
168 |
+
mul.f32 %f384, %f11, %f11;
|
169 |
+
mov.f32 %f383, 0f38B1E96A;
|
170 |
+
mov.f32 %f382, 0fBA574D20;
|
171 |
+
mov.f32 %f381, 0f3BAAD5EA;
|
172 |
+
mov.f32 %f380, 0fBCDC1BE7;
|
173 |
+
mov.f32 %f379, 0f3DE718AF;
|
174 |
+
mov.f32 %f378, 0fBEC093AC;
|
175 |
+
mov.f32 %f377, 0f3E0375D3;
|
176 |
+
$L__BB0_10:
|
177 |
+
.loc 1 0 0
|
178 |
+
cvt.f32.bf16 %r10, %rs5;
|
179 |
+
mul.f32 %f12, %f4, 0f3F3504F3;
|
180 |
+
.loc 1 30 23
|
181 |
+
setp.ltu.f32 %p9, %f43, 0f3F8060FE;
|
182 |
+
fma.rn.ftz.f32 %f185, %f383, %f384, %f382;
|
183 |
+
fma.rn.ftz.f32 %f186, %f185, %f384, %f381;
|
184 |
+
fma.rn.ftz.f32 %f187, %f186, %f384, %f380;
|
185 |
+
fma.rn.ftz.f32 %f188, %f187, %f384, %f379;
|
186 |
+
fma.rn.ftz.f32 %f189, %f188, %f384, %f378;
|
187 |
+
fma.rn.ftz.f32 %f190, %f189, %f384, %f377;
|
188 |
+
neg.f32 %f191, %f384;
|
189 |
+
selp.f32 %f192, %f191, %f11, %p8;
|
190 |
+
fma.rn.ftz.f32 %f385, %f190, %f192, %f192;
|
191 |
+
@%p9 bra $L__BB0_12;
|
192 |
+
ex2.approx.ftz.f32 %f193, %f385;
|
193 |
+
sub.f32 %f195, %f358, %f193;
|
194 |
+
mov.b32 %r27, %f195;
|
195 |
+
mov.b32 %r28, %f11;
|
196 |
+
and.b32 %r29, %r28, -2147483648;
|
197 |
+
or.b32 %r30, %r29, %r27;
|
198 |
+
mov.b32 %f385, %r30;
|
199 |
+
$L__BB0_12:
|
200 |
+
.loc 1 0 0
|
201 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
|
202 |
+
mov.b32 %f5, %r10;
|
203 |
+
.loc 1 30 23
|
204 |
+
abs.ftz.f32 %f56, %f12;
|
205 |
+
setp.ge.f32 %p11, %f56, 0f3F8060FE;
|
206 |
+
mov.f32 %f392, 0f3789CA3C;
|
207 |
+
mov.f32 %f391, 0fB9F560B9;
|
208 |
+
mov.f32 %f390, 0f3BAC840B;
|
209 |
+
mov.f32 %f389, 0fBD0C8162;
|
210 |
+
mov.f32 %f388, 0f3E1CF906;
|
211 |
+
mov.f32 %f387, 0f3F6A937E;
|
212 |
+
mov.f32 %f386, 0f3F20D842;
|
213 |
+
mov.f32 %f393, %f56;
|
214 |
+
@%p11 bra $L__BB0_14;
|
215 |
+
mul.f32 %f393, %f12, %f12;
|
216 |
+
mov.f32 %f392, 0f38B1E96A;
|
217 |
+
mov.f32 %f391, 0fBA574D20;
|
218 |
+
mov.f32 %f390, 0f3BAAD5EA;
|
219 |
+
mov.f32 %f389, 0fBCDC1BE7;
|
220 |
+
mov.f32 %f388, 0f3DE718AF;
|
221 |
+
mov.f32 %f387, 0fBEC093AC;
|
222 |
+
mov.f32 %f386, 0f3E0375D3;
|
223 |
+
$L__BB0_14:
|
224 |
+
.loc 1 0 0
|
225 |
+
cvt.f32.bf16 %r11, %rs6;
|
226 |
+
mul.f32 %f13, %f5, 0f3F3504F3;
|
227 |
+
.loc 1 30 23
|
228 |
+
setp.ltu.f32 %p12, %f56, 0f3F8060FE;
|
229 |
+
fma.rn.ftz.f32 %f210, %f392, %f393, %f391;
|
230 |
+
fma.rn.ftz.f32 %f211, %f210, %f393, %f390;
|
231 |
+
fma.rn.ftz.f32 %f212, %f211, %f393, %f389;
|
232 |
+
fma.rn.ftz.f32 %f213, %f212, %f393, %f388;
|
233 |
+
fma.rn.ftz.f32 %f214, %f213, %f393, %f387;
|
234 |
+
fma.rn.ftz.f32 %f215, %f214, %f393, %f386;
|
235 |
+
neg.f32 %f216, %f393;
|
236 |
+
selp.f32 %f217, %f216, %f12, %p11;
|
237 |
+
fma.rn.ftz.f32 %f394, %f215, %f217, %f217;
|
238 |
+
@%p12 bra $L__BB0_16;
|
239 |
+
ex2.approx.ftz.f32 %f218, %f394;
|
240 |
+
sub.f32 %f220, %f358, %f218;
|
241 |
+
mov.b32 %r31, %f220;
|
242 |
+
mov.b32 %r32, %f12;
|
243 |
+
and.b32 %r33, %r32, -2147483648;
|
244 |
+
or.b32 %r34, %r33, %r31;
|
245 |
+
mov.b32 %f394, %r34;
|
246 |
+
$L__BB0_16:
|
247 |
+
.loc 1 0 0
|
248 |
+
cvt.u16.u32 %rs7, %r5;
|
249 |
+
mov.b32 %f6, %r11;
|
250 |
+
.loc 1 30 23
|
251 |
+
abs.ftz.f32 %f69, %f13;
|
252 |
+
setp.ge.f32 %p14, %f69, 0f3F8060FE;
|
253 |
+
mov.f32 %f401, 0f3789CA3C;
|
254 |
+
mov.f32 %f400, 0fB9F560B9;
|
255 |
+
mov.f32 %f399, 0f3BAC840B;
|
256 |
+
mov.f32 %f398, 0fBD0C8162;
|
257 |
+
mov.f32 %f397, 0f3E1CF906;
|
258 |
+
mov.f32 %f396, 0f3F6A937E;
|
259 |
+
mov.f32 %f395, 0f3F20D842;
|
260 |
+
mov.f32 %f402, %f69;
|
261 |
+
@%p14 bra $L__BB0_18;
|
262 |
+
mul.f32 %f402, %f13, %f13;
|
263 |
+
mov.f32 %f401, 0f38B1E96A;
|
264 |
+
mov.f32 %f400, 0fBA574D20;
|
265 |
+
mov.f32 %f399, 0f3BAAD5EA;
|
266 |
+
mov.f32 %f398, 0fBCDC1BE7;
|
267 |
+
mov.f32 %f397, 0f3DE718AF;
|
268 |
+
mov.f32 %f396, 0fBEC093AC;
|
269 |
+
mov.f32 %f395, 0f3E0375D3;
|
270 |
+
$L__BB0_18:
|
271 |
+
.loc 1 0 0
|
272 |
+
cvt.f32.bf16 %r12, %rs7;
|
273 |
+
mul.f32 %f14, %f6, 0f3F3504F3;
|
274 |
+
.loc 1 30 23
|
275 |
+
setp.ltu.f32 %p15, %f69, 0f3F8060FE;
|
276 |
+
fma.rn.ftz.f32 %f235, %f401, %f402, %f400;
|
277 |
+
fma.rn.ftz.f32 %f236, %f235, %f402, %f399;
|
278 |
+
fma.rn.ftz.f32 %f237, %f236, %f402, %f398;
|
279 |
+
fma.rn.ftz.f32 %f238, %f237, %f402, %f397;
|
280 |
+
fma.rn.ftz.f32 %f239, %f238, %f402, %f396;
|
281 |
+
fma.rn.ftz.f32 %f240, %f239, %f402, %f395;
|
282 |
+
neg.f32 %f241, %f402;
|
283 |
+
selp.f32 %f242, %f241, %f13, %p14;
|
284 |
+
fma.rn.ftz.f32 %f403, %f240, %f242, %f242;
|
285 |
+
@%p15 bra $L__BB0_20;
|
286 |
+
ex2.approx.ftz.f32 %f243, %f403;
|
287 |
+
sub.f32 %f245, %f358, %f243;
|
288 |
+
mov.b32 %r35, %f245;
|
289 |
+
mov.b32 %r36, %f13;
|
290 |
+
and.b32 %r37, %r36, -2147483648;
|
291 |
+
or.b32 %r38, %r37, %r35;
|
292 |
+
mov.b32 %f403, %r38;
|
293 |
+
$L__BB0_20:
|
294 |
+
.loc 1 0 0
|
295 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
|
296 |
+
mov.b32 %f7, %r12;
|
297 |
+
.loc 1 30 23
|
298 |
+
abs.ftz.f32 %f82, %f14;
|
299 |
+
setp.ge.f32 %p17, %f82, 0f3F8060FE;
|
300 |
+
mov.f32 %f410, 0f3789CA3C;
|
301 |
+
mov.f32 %f409, 0fB9F560B9;
|
302 |
+
mov.f32 %f408, 0f3BAC840B;
|
303 |
+
mov.f32 %f407, 0fBD0C8162;
|
304 |
+
mov.f32 %f406, 0f3E1CF906;
|
305 |
+
mov.f32 %f405, 0f3F6A937E;
|
306 |
+
mov.f32 %f404, 0f3F20D842;
|
307 |
+
mov.f32 %f411, %f82;
|
308 |
+
@%p17 bra $L__BB0_22;
|
309 |
+
mul.f32 %f411, %f14, %f14;
|
310 |
+
mov.f32 %f410, 0f38B1E96A;
|
311 |
+
mov.f32 %f409, 0fBA574D20;
|
312 |
+
mov.f32 %f408, 0f3BAAD5EA;
|
313 |
+
mov.f32 %f407, 0fBCDC1BE7;
|
314 |
+
mov.f32 %f406, 0f3DE718AF;
|
315 |
+
mov.f32 %f405, 0fBEC093AC;
|
316 |
+
mov.f32 %f404, 0f3E0375D3;
|
317 |
+
$L__BB0_22:
|
318 |
+
.loc 1 0 0
|
319 |
+
cvt.f32.bf16 %r13, %rs8;
|
320 |
+
mul.f32 %f15, %f7, 0f3F3504F3;
|
321 |
+
.loc 1 30 23
|
322 |
+
setp.ltu.f32 %p18, %f82, 0f3F8060FE;
|
323 |
+
fma.rn.ftz.f32 %f260, %f410, %f411, %f409;
|
324 |
+
fma.rn.ftz.f32 %f261, %f260, %f411, %f408;
|
325 |
+
fma.rn.ftz.f32 %f262, %f261, %f411, %f407;
|
326 |
+
fma.rn.ftz.f32 %f263, %f262, %f411, %f406;
|
327 |
+
fma.rn.ftz.f32 %f264, %f263, %f411, %f405;
|
328 |
+
fma.rn.ftz.f32 %f265, %f264, %f411, %f404;
|
329 |
+
neg.f32 %f266, %f411;
|
330 |
+
selp.f32 %f267, %f266, %f14, %p17;
|
331 |
+
fma.rn.ftz.f32 %f412, %f265, %f267, %f267;
|
332 |
+
@%p18 bra $L__BB0_24;
|
333 |
+
ex2.approx.ftz.f32 %f268, %f412;
|
334 |
+
sub.f32 %f270, %f358, %f268;
|
335 |
+
mov.b32 %r39, %f270;
|
336 |
+
mov.b32 %r40, %f14;
|
337 |
+
and.b32 %r41, %r40, -2147483648;
|
338 |
+
or.b32 %r42, %r41, %r39;
|
339 |
+
mov.b32 %f412, %r42;
|
340 |
+
$L__BB0_24:
|
341 |
+
.loc 1 0 0
|
342 |
+
mov.b32 %f8, %r13;
|
343 |
+
.loc 1 30 23
|
344 |
+
abs.ftz.f32 %f95, %f15;
|
345 |
+
setp.ge.f32 %p20, %f95, 0f3F8060FE;
|
346 |
+
mov.f32 %f419, 0f3789CA3C;
|
347 |
+
mov.f32 %f418, 0fB9F560B9;
|
348 |
+
mov.f32 %f417, 0f3BAC840B;
|
349 |
+
mov.f32 %f416, 0fBD0C8162;
|
350 |
+
mov.f32 %f415, 0f3E1CF906;
|
351 |
+
mov.f32 %f414, 0f3F6A937E;
|
352 |
+
mov.f32 %f413, 0f3F20D842;
|
353 |
+
mov.f32 %f420, %f95;
|
354 |
+
@%p20 bra $L__BB0_26;
|
355 |
+
mul.f32 %f420, %f15, %f15;
|
356 |
+
mov.f32 %f419, 0f38B1E96A;
|
357 |
+
mov.f32 %f418, 0fBA574D20;
|
358 |
+
mov.f32 %f417, 0f3BAAD5EA;
|
359 |
+
mov.f32 %f416, 0fBCDC1BE7;
|
360 |
+
mov.f32 %f415, 0f3DE718AF;
|
361 |
+
mov.f32 %f414, 0fBEC093AC;
|
362 |
+
mov.f32 %f413, 0f3E0375D3;
|
363 |
+
$L__BB0_26:
|
364 |
+
.loc 1 0 0
|
365 |
+
mul.f32 %f16, %f8, 0f3F3504F3;
|
366 |
+
.loc 1 30 23
|
367 |
+
setp.ltu.f32 %p21, %f95, 0f3F8060FE;
|
368 |
+
fma.rn.ftz.f32 %f285, %f419, %f420, %f418;
|
369 |
+
fma.rn.ftz.f32 %f286, %f285, %f420, %f417;
|
370 |
+
fma.rn.ftz.f32 %f287, %f286, %f420, %f416;
|
371 |
+
fma.rn.ftz.f32 %f288, %f287, %f420, %f415;
|
372 |
+
fma.rn.ftz.f32 %f289, %f288, %f420, %f414;
|
373 |
+
fma.rn.ftz.f32 %f290, %f289, %f420, %f413;
|
374 |
+
neg.f32 %f291, %f420;
|
375 |
+
selp.f32 %f292, %f291, %f15, %p20;
|
376 |
+
fma.rn.ftz.f32 %f421, %f290, %f292, %f292;
|
377 |
+
@%p21 bra $L__BB0_28;
|
378 |
+
ex2.approx.ftz.f32 %f293, %f421;
|
379 |
+
sub.f32 %f295, %f358, %f293;
|
380 |
+
mov.b32 %r43, %f295;
|
381 |
+
mov.b32 %r44, %f15;
|
382 |
+
and.b32 %r45, %r44, -2147483648;
|
383 |
+
or.b32 %r46, %r45, %r43;
|
384 |
+
mov.b32 %f421, %r46;
|
385 |
+
$L__BB0_28:
|
386 |
+
abs.ftz.f32 %f108, %f16;
|
387 |
+
setp.ge.f32 %p23, %f108, 0f3F8060FE;
|
388 |
+
mov.f32 %f428, 0f3789CA3C;
|
389 |
+
mov.f32 %f427, 0fB9F560B9;
|
390 |
+
mov.f32 %f426, 0f3BAC840B;
|
391 |
+
mov.f32 %f425, 0fBD0C8162;
|
392 |
+
mov.f32 %f424, 0f3E1CF906;
|
393 |
+
mov.f32 %f423, 0f3F6A937E;
|
394 |
+
mov.f32 %f422, 0f3F20D842;
|
395 |
+
mov.f32 %f429, %f108;
|
396 |
+
@%p23 bra $L__BB0_30;
|
397 |
+
mul.f32 %f429, %f16, %f16;
|
398 |
+
mov.f32 %f428, 0f38B1E96A;
|
399 |
+
mov.f32 %f427, 0fBA574D20;
|
400 |
+
mov.f32 %f426, 0f3BAAD5EA;
|
401 |
+
mov.f32 %f425, 0fBCDC1BE7;
|
402 |
+
mov.f32 %f424, 0f3DE718AF;
|
403 |
+
mov.f32 %f423, 0fBEC093AC;
|
404 |
+
mov.f32 %f422, 0f3E0375D3;
|
405 |
+
$L__BB0_30:
|
406 |
+
setp.ltu.f32 %p24, %f108, 0f3F8060FE;
|
407 |
+
fma.rn.ftz.f32 %f310, %f428, %f429, %f427;
|
408 |
+
fma.rn.ftz.f32 %f311, %f310, %f429, %f426;
|
409 |
+
fma.rn.ftz.f32 %f312, %f311, %f429, %f425;
|
410 |
+
fma.rn.ftz.f32 %f313, %f312, %f429, %f424;
|
411 |
+
fma.rn.ftz.f32 %f314, %f313, %f429, %f423;
|
412 |
+
fma.rn.ftz.f32 %f315, %f314, %f429, %f422;
|
413 |
+
neg.f32 %f316, %f429;
|
414 |
+
selp.f32 %f317, %f316, %f16, %p23;
|
415 |
+
fma.rn.ftz.f32 %f430, %f315, %f317, %f317;
|
416 |
+
@%p24 bra $L__BB0_32;
|
417 |
+
ex2.approx.ftz.f32 %f318, %f430;
|
418 |
+
sub.f32 %f320, %f358, %f318;
|
419 |
+
mov.b32 %r47, %f320;
|
420 |
+
mov.b32 %r48, %f16;
|
421 |
+
and.b32 %r49, %r48, -2147483648;
|
422 |
+
or.b32 %r50, %r49, %r47;
|
423 |
+
mov.b32 %f430, %r50;
|
424 |
+
$L__BB0_32:
|
425 |
+
.loc 1 27 18
|
426 |
+
mul.f32 %f321, %f8, 0f3F000000;
|
427 |
+
mul.f32 %f322, %f7, 0f3F000000;
|
428 |
+
mul.f32 %f323, %f6, 0f3F000000;
|
429 |
+
mul.f32 %f324, %f5, 0f3F000000;
|
430 |
+
mul.f32 %f325, %f4, 0f3F000000;
|
431 |
+
mul.f32 %f326, %f3, 0f3F000000;
|
432 |
+
mul.f32 %f327, %f2, 0f3F000000;
|
433 |
+
mul.f32 %f328, %f1, 0f3F000000;
|
434 |
+
.loc 1 32 18
|
435 |
+
add.f32 %f329, %f367, 0f3F800000;
|
436 |
+
add.f32 %f330, %f376, 0f3F800000;
|
437 |
+
add.f32 %f331, %f385, 0f3F800000;
|
438 |
+
add.f32 %f332, %f394, 0f3F800000;
|
439 |
+
add.f32 %f333, %f403, 0f3F800000;
|
440 |
+
add.f32 %f334, %f412, 0f3F800000;
|
441 |
+
add.f32 %f335, %f421, 0f3F800000;
|
442 |
+
add.f32 %f336, %f430, 0f3F800000;
|
443 |
+
.loc 1 33 18
|
444 |
+
mul.f32 %f337, %f328, %f329;
|
445 |
+
mul.f32 %f338, %f327, %f330;
|
446 |
+
mul.f32 %f339, %f326, %f331;
|
447 |
+
mul.f32 %f340, %f325, %f332;
|
448 |
+
mul.f32 %f341, %f324, %f333;
|
449 |
+
mul.f32 %f342, %f323, %f334;
|
450 |
+
mul.f32 %f343, %f322, %f335;
|
451 |
+
mul.f32 %f344, %f321, %f336;
|
452 |
+
.loc 1 35 40
|
453 |
+
mov.b32 %r51, %f337;
|
454 |
+
cvt.rn.bf16.f32 %rs9, %r51;
|
455 |
+
mov.b32 %r52, %f338;
|
456 |
+
cvt.rn.bf16.f32 %rs10, %r52;
|
457 |
+
mov.b32 %r53, %f339;
|
458 |
+
cvt.rn.bf16.f32 %rs11, %r53;
|
459 |
+
mov.b32 %r54, %f340;
|
460 |
+
cvt.rn.bf16.f32 %rs12, %r54;
|
461 |
+
mov.b32 %r55, %f341;
|
462 |
+
cvt.rn.bf16.f32 %rs13, %r55;
|
463 |
+
mov.b32 %r56, %f342;
|
464 |
+
cvt.rn.bf16.f32 %rs14, %r56;
|
465 |
+
mov.b32 %r57, %f343;
|
466 |
+
cvt.rn.bf16.f32 %rs15, %r57;
|
467 |
+
mov.b32 %r58, %f344;
|
468 |
+
cvt.rn.bf16.f32 %rs16, %r58;
|
469 |
+
mov.b32 %r63, {%rs9, %rs10};
|
470 |
+
mov.b32 %r64, {%rs11, %rs12};
|
471 |
+
mov.b32 %r65, {%rs13, %rs14};
|
472 |
+
mov.b32 %r66, {%rs15, %rs16};
|
473 |
+
@%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r63, %r64, %r65, %r66 };
|
474 |
+
.loc 1 35 4
|
475 |
+
ret;
|
476 |
+
$L__tmp1:
|
477 |
+
$L__func_end0:
|
478 |
+
|
479 |
+
}
|
480 |
+
// .globl __nv_erff
|
481 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
482 |
+
.param .b32 __nv_erff_param_0
|
483 |
+
)
|
484 |
+
{
|
485 |
+
.reg .pred %p<4>;
|
486 |
+
.reg .b32 %r<5>;
|
487 |
+
.reg .f32 %f<49>;
|
488 |
+
$L__func_begin1:
|
489 |
+
|
490 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
491 |
+
abs.ftz.f32 %f1, %f14;
|
492 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
493 |
+
mov.f32 %f46, 0f3789CA3C;
|
494 |
+
mov.f32 %f45, 0fB9F560B9;
|
495 |
+
mov.f32 %f44, 0f3BAC840B;
|
496 |
+
mov.f32 %f43, 0fBD0C8162;
|
497 |
+
mov.f32 %f42, 0f3E1CF906;
|
498 |
+
mov.f32 %f41, 0f3F6A937E;
|
499 |
+
mov.f32 %f40, 0f3F20D842;
|
500 |
+
mov.f32 %f47, %f1;
|
501 |
+
@%p1 bra $L__BB1_2;
|
502 |
+
mul.f32 %f47, %f14, %f14;
|
503 |
+
mov.f32 %f46, 0f38B1E96A;
|
504 |
+
mov.f32 %f45, 0fBA574D20;
|
505 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
506 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
507 |
+
mov.f32 %f42, 0f3DE718AF;
|
508 |
+
mov.f32 %f41, 0fBEC093AC;
|
509 |
+
mov.f32 %f40, 0f3E0375D3;
|
510 |
+
$L__BB1_2:
|
511 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
512 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
513 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
514 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
515 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
516 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
517 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
518 |
+
neg.f32 %f35, %f47;
|
519 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
520 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
521 |
+
@%p2 bra $L__BB1_4;
|
522 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
523 |
+
mov.f32 %f38, 0f3F800000;
|
524 |
+
sub.f32 %f39, %f38, %f37;
|
525 |
+
mov.b32 %r1, %f39;
|
526 |
+
mov.b32 %r2, %f14;
|
527 |
+
and.b32 %r3, %r2, -2147483648;
|
528 |
+
or.b32 %r4, %r3, %r1;
|
529 |
+
mov.b32 %f48, %r4;
|
530 |
+
$L__BB1_4:
|
531 |
+
st.param.f32 [func_retval0+0], %f48;
|
532 |
+
ret;
|
533 |
+
$L__func_end1:
|
534 |
+
|
535 |
+
}
|
536 |
+
.file 1 "/tmp/torchinductor_root/af/cafucwnmq4o436kwzkmrinerrnocxll7q6wsadcl726g6cradipo.py"
|
537 |
+
.section .debug_abbrev
|
538 |
+
{
|
539 |
+
.b8 1
|
540 |
+
.b8 17
|
541 |
+
.b8 1
|
542 |
+
.b8 37
|
543 |
+
.b8 8
|
544 |
+
.b8 19
|
545 |
+
.b8 5
|
546 |
+
.b8 3
|
547 |
+
.b8 8
|
548 |
+
.b8 16
|
549 |
+
.b8 6
|
550 |
+
.b8 27
|
551 |
+
.b8 8
|
552 |
+
.b8 180
|
553 |
+
.b8 66
|
554 |
+
.b8 12
|
555 |
+
.b8 17
|
556 |
+
.b8 1
|
557 |
+
.b8 18
|
558 |
+
.b8 1
|
559 |
+
.b8 0
|
560 |
+
.b8 0
|
561 |
+
.b8 2
|
562 |
+
.b8 46
|
563 |
+
.b8 0
|
564 |
+
.b8 17
|
565 |
+
.b8 1
|
566 |
+
.b8 18
|
567 |
+
.b8 1
|
568 |
+
.b8 64
|
569 |
+
.b8 10
|
570 |
+
.b8 135
|
571 |
+
.b8 64
|
572 |
+
.b8 8
|
573 |
+
.b8 3
|
574 |
+
.b8 8
|
575 |
+
.b8 58
|
576 |
+
.b8 11
|
577 |
+
.b8 59
|
578 |
+
.b8 11
|
579 |
+
.b8 63
|
580 |
+
.b8 12
|
581 |
+
.b8 0
|
582 |
+
.b8 0
|
583 |
+
.b8 0
|
584 |
+
}
|
585 |
+
.section .debug_info
|
586 |
+
{
|
587 |
+
.b32 172
|
588 |
+
.b8 2
|
589 |
+
.b8 0
|
590 |
+
.b32 .debug_abbrev
|
591 |
+
.b8 8
|
592 |
+
.b8 1
|
593 |
+
.b8 116
|
594 |
+
.b8 114
|
595 |
+
.b8 105
|
596 |
+
.b8 116
|
597 |
+
.b8 111
|
598 |
+
.b8 110
|
599 |
+
.b8 0
|
600 |
+
.b8 2
|
601 |
+
.b8 0
|
602 |
+
.b8 99
|
603 |
+
.b8 97
|
604 |
+
.b8 102
|
605 |
+
.b8 117
|
606 |
+
.b8 99
|
607 |
+
.b8 119
|
608 |
+
.b8 110
|
609 |
+
.b8 109
|
610 |
+
.b8 113
|
611 |
+
.b8 52
|
612 |
+
.b8 111
|
613 |
+
.b8 52
|
614 |
+
.b8 51
|
615 |
+
.b8 54
|
616 |
+
.b8 107
|
617 |
+
.b8 119
|
618 |
+
.b8 122
|
619 |
+
.b8 107
|
620 |
+
.b8 109
|
621 |
+
.b8 114
|
622 |
+
.b8 105
|
623 |
+
.b8 110
|
624 |
+
.b8 101
|
625 |
+
.b8 114
|
626 |
+
.b8 114
|
627 |
+
.b8 110
|
628 |
+
.b8 111
|
629 |
+
.b8 99
|
630 |
+
.b8 120
|
631 |
+
.b8 108
|
632 |
+
.b8 108
|
633 |
+
.b8 55
|
634 |
+
.b8 113
|
635 |
+
.b8 54
|
636 |
+
.b8 119
|
637 |
+
.b8 115
|
638 |
+
.b8 97
|
639 |
+
.b8 100
|
640 |
+
.b8 99
|
641 |
+
.b8 108
|
642 |
+
.b8 55
|
643 |
+
.b8 50
|
644 |
+
.b8 54
|
645 |
+
.b8 103
|
646 |
+
.b8 54
|
647 |
+
.b8 99
|
648 |
+
.b8 114
|
649 |
+
.b8 97
|
650 |
+
.b8 100
|
651 |
+
.b8 105
|
652 |
+
.b8 112
|
653 |
+
.b8 111
|
654 |
+
.b8 46
|
655 |
+
.b8 112
|
656 |
+
.b8 121
|
657 |
+
.b8 0
|
658 |
+
.b32 .debug_line
|
659 |
+
.b8 47
|
660 |
+
.b8 116
|
661 |
+
.b8 109
|
662 |
+
.b8 112
|
663 |
+
.b8 47
|
664 |
+
.b8 116
|
665 |
+
.b8 111
|
666 |
+
.b8 114
|
667 |
+
.b8 99
|
668 |
+
.b8 104
|
669 |
+
.b8 105
|
670 |
+
.b8 110
|
671 |
+
.b8 100
|
672 |
+
.b8 117
|
673 |
+
.b8 99
|
674 |
+
.b8 116
|
675 |
+
.b8 111
|
676 |
+
.b8 114
|
677 |
+
.b8 95
|
678 |
+
.b8 114
|
679 |
+
.b8 111
|
680 |
+
.b8 111
|
681 |
+
.b8 116
|
682 |
+
.b8 47
|
683 |
+
.b8 97
|
684 |
+
.b8 102
|
685 |
+
.b8 0
|
686 |
+
.b8 1
|
687 |
+
.b64 $L__func_begin0
|
688 |
+
.b64 $L__func_end0
|
689 |
+
.b8 2
|
690 |
+
.b64 $L__func_begin0
|
691 |
+
.b64 $L__func_end0
|
692 |
+
.b8 1
|
693 |
+
.b8 156
|
694 |
+
.b8 116
|
695 |
+
.b8 114
|
696 |
+
.b8 105
|
697 |
+
.b8 116
|
698 |
+
.b8 111
|
699 |
+
.b8 110
|
700 |
+
.b8 95
|
701 |
+
.b8 95
|
702 |
+
.b8 48
|
703 |
+
.b8 100
|
704 |
+
.b8 49
|
705 |
+
.b8 100
|
706 |
+
.b8 101
|
707 |
+
.b8 0
|
708 |
+
.b8 116
|
709 |
+
.b8 114
|
710 |
+
.b8 105
|
711 |
+
.b8 116
|
712 |
+
.b8 111
|
713 |
+
.b8 110
|
714 |
+
.b8 95
|
715 |
+
.b8 95
|
716 |
+
.b8 48
|
717 |
+
.b8 100
|
718 |
+
.b8 49
|
719 |
+
.b8 100
|
720 |
+
.b8 101
|
721 |
+
.b8 0
|
722 |
+
.b8 1
|
723 |
+
.b8 18
|
724 |
+
.b8 1
|
725 |
+
.b8 0
|
726 |
+
}
|
727 |
+
.section .debug_pubnames
|
728 |
+
{
|
729 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
730 |
+
$L__pubNames_start0:
|
731 |
+
.b8 2
|
732 |
+
.b8 0
|
733 |
+
.b32 .debug_info
|
734 |
+
.b32 176
|
735 |
+
.b32 125
|
736 |
+
.b8 116
|
737 |
+
.b8 114
|
738 |
+
.b8 105
|
739 |
+
.b8 116
|
740 |
+
.b8 111
|
741 |
+
.b8 110
|
742 |
+
.b8 95
|
743 |
+
.b8 95
|
744 |
+
.b8 48
|
745 |
+
.b8 100
|
746 |
+
.b8 49
|
747 |
+
.b8 100
|
748 |
+
.b8 101
|
749 |
+
.b8 0
|
750 |
+
.b32 0
|
751 |
+
$L__pubNames_end0:
|
752 |
+
}
|
753 |
+
.section .debug_pubtypes
|
754 |
+
{
|
755 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
756 |
+
$L__pubTypes_start0:
|
757 |
+
.b8 2
|
758 |
+
.b8 0
|
759 |
+
.b32 .debug_info
|
760 |
+
.b32 176
|
761 |
+
.b32 0
|
762 |
+
$L__pubTypes_end0:
|
763 |
+
}
|
764 |
+
.section .debug_loc { }
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
|
7 |
+
%c1024_i32 = arith.constant 1024 : i32
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
10 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
11 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
12 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
13 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
14 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
15 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
16 |
+
%8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
|
17 |
+
%9 = arith.mulf %8, %cst_1 : tensor<1024xf32, #blocked>
|
18 |
+
%10 = arith.mulf %8, %cst_0 : tensor<1024xf32, #blocked>
|
19 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
|
20 |
+
%12 = arith.addf %11, %cst : tensor<1024xf32, #blocked>
|
21 |
+
%13 = arith.mulf %9, %12 : tensor<1024xf32, #blocked>
|
22 |
+
%14 = arith.truncf %13 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
23 |
+
tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
24 |
+
tt.return
|
25 |
+
}
|
26 |
+
}
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
|
6 |
+
%c1024_i32 = arith.constant 1024 : i32
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
12 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
13 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
14 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
15 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
16 |
+
%9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
|
17 |
+
%10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
|
18 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
|
19 |
+
%12 = arith.addf %11, %cst : tensor<1024xf32>
|
20 |
+
%13 = arith.mulf %9, %12 : tensor<1024xf32>
|
21 |
+
%14 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
|
22 |
+
tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
23 |
+
tt.return
|
24 |
+
}
|
25 |
+
}
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin
ADDED
Binary file (14.6 kB). View file
|
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir
ADDED
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = and i32 %6, 7, !dbg !8
|
11 |
+
%10 = shl nuw nsw i32 %9, 2, !dbg !8
|
12 |
+
%11 = and i32 %8, 7, !dbg !9
|
13 |
+
%12 = lshr i32 %7, 3, !dbg !9
|
14 |
+
%13 = shl nuw nsw i32 %11, 2, !dbg !9
|
15 |
+
%14 = or i32 %13, %12, !dbg !9
|
16 |
+
%15 = or i32 %14, 96, !dbg !9
|
17 |
+
%16 = or i32 %10, 1, !dbg !10
|
18 |
+
%17 = or i32 %10, 2, !dbg !10
|
19 |
+
%18 = or i32 %10, 3, !dbg !10
|
20 |
+
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
|
21 |
+
%20 = shl i32 %19, 5, !dbg !15
|
22 |
+
%21 = or i32 %20, %10, !dbg !16
|
23 |
+
%22 = or i32 %20, %7, !dbg !16
|
24 |
+
%23 = icmp ult i32 %15, 120, !dbg !17
|
25 |
+
%24 = shl nuw nsw i32 %14, 17, !dbg !18
|
26 |
+
%25 = or i32 %24, 4194304, !dbg !18
|
27 |
+
%26 = or i32 %24, 8388608, !dbg !18
|
28 |
+
%27 = shl nuw nsw i32 %15, 17, !dbg !18
|
29 |
+
%28 = add i32 %21, %24, !dbg !19
|
30 |
+
%29 = add i32 %25, %21, !dbg !19
|
31 |
+
%30 = add i32 %26, %21, !dbg !19
|
32 |
+
%31 = add i32 %21, %27, !dbg !19
|
33 |
+
%32 = sext i32 %28 to i64, !dbg !20
|
34 |
+
%33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !20
|
35 |
+
%34 = sext i32 %29 to i64, !dbg !20
|
36 |
+
%35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !20
|
37 |
+
%36 = sext i32 %30 to i64, !dbg !20
|
38 |
+
%37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !20
|
39 |
+
%38 = sext i32 %31 to i64, !dbg !20
|
40 |
+
%39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !20
|
41 |
+
%40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
42 |
+
%41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !21
|
43 |
+
%42 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !21
|
44 |
+
%43 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !21
|
45 |
+
%44 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !21
|
46 |
+
%45 = bitcast i32 %41 to float, !dbg !21
|
47 |
+
%46 = bitcast i32 %42 to float, !dbg !21
|
48 |
+
%47 = bitcast i32 %43 to float, !dbg !21
|
49 |
+
%48 = bitcast i32 %44 to float, !dbg !21
|
50 |
+
%49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
51 |
+
%50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !21
|
52 |
+
%51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !21
|
53 |
+
%52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !21
|
54 |
+
%53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !21
|
55 |
+
%54 = bitcast i32 %50 to float, !dbg !21
|
56 |
+
%55 = bitcast i32 %51 to float, !dbg !21
|
57 |
+
%56 = bitcast i32 %52 to float, !dbg !21
|
58 |
+
%57 = bitcast i32 %53 to float, !dbg !21
|
59 |
+
%58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
60 |
+
%59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !21
|
61 |
+
%60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !21
|
62 |
+
%61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !21
|
63 |
+
%62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !21
|
64 |
+
%63 = bitcast i32 %59 to float, !dbg !21
|
65 |
+
%64 = bitcast i32 %60 to float, !dbg !21
|
66 |
+
%65 = bitcast i32 %61 to float, !dbg !21
|
67 |
+
%66 = bitcast i32 %62 to float, !dbg !21
|
68 |
+
%67 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23) #3, !dbg !21
|
69 |
+
%68 = extractvalue { i32, i32, i32, i32 } %67, 0, !dbg !21
|
70 |
+
%69 = extractvalue { i32, i32, i32, i32 } %67, 1, !dbg !21
|
71 |
+
%70 = extractvalue { i32, i32, i32, i32 } %67, 2, !dbg !21
|
72 |
+
%71 = extractvalue { i32, i32, i32, i32 } %67, 3, !dbg !21
|
73 |
+
%72 = bitcast i32 %68 to float, !dbg !21
|
74 |
+
%73 = bitcast i32 %69 to float, !dbg !21
|
75 |
+
%74 = bitcast i32 %70 to float, !dbg !21
|
76 |
+
%75 = bitcast i32 %71 to float, !dbg !21
|
77 |
+
%76 = fadd float %45, 0.000000e+00, !dbg !22
|
78 |
+
%77 = fadd float %46, 0.000000e+00, !dbg !22
|
79 |
+
%78 = fadd float %47, 0.000000e+00, !dbg !22
|
80 |
+
%79 = fadd float %48, 0.000000e+00, !dbg !22
|
81 |
+
%80 = fadd float %54, 0.000000e+00, !dbg !22
|
82 |
+
%81 = fadd float %55, 0.000000e+00, !dbg !22
|
83 |
+
%82 = fadd float %56, 0.000000e+00, !dbg !22
|
84 |
+
%83 = fadd float %57, 0.000000e+00, !dbg !22
|
85 |
+
%84 = fadd float %63, 0.000000e+00, !dbg !22
|
86 |
+
%85 = fadd float %64, 0.000000e+00, !dbg !22
|
87 |
+
%86 = fadd float %65, 0.000000e+00, !dbg !22
|
88 |
+
%87 = fadd float %66, 0.000000e+00, !dbg !22
|
89 |
+
%88 = fadd float %72, 0.000000e+00, !dbg !22
|
90 |
+
%89 = fadd float %73, 0.000000e+00, !dbg !22
|
91 |
+
%90 = fadd float %74, 0.000000e+00, !dbg !22
|
92 |
+
%91 = fadd float %75, 0.000000e+00, !dbg !22
|
93 |
+
%92 = select i1 %23, float %88, float 0.000000e+00, !dbg !23
|
94 |
+
%93 = select i1 %23, float %89, float 0.000000e+00, !dbg !23
|
95 |
+
%94 = select i1 %23, float %90, float 0.000000e+00, !dbg !23
|
96 |
+
%95 = select i1 %23, float %91, float 0.000000e+00, !dbg !23
|
97 |
+
%96 = fadd float %76, %80, !dbg !24
|
98 |
+
%97 = fadd float %77, %81, !dbg !24
|
99 |
+
%98 = fadd float %78, %82, !dbg !24
|
100 |
+
%99 = fadd float %79, %83, !dbg !24
|
101 |
+
%100 = fadd float %96, %84, !dbg !24
|
102 |
+
%101 = fadd float %97, %85, !dbg !24
|
103 |
+
%102 = fadd float %98, %86, !dbg !24
|
104 |
+
%103 = fadd float %99, %87, !dbg !24
|
105 |
+
%104 = fadd float %100, %92, !dbg !24
|
106 |
+
%105 = fadd float %101, %93, !dbg !24
|
107 |
+
%106 = fadd float %102, %94, !dbg !24
|
108 |
+
%107 = fadd float %103, %95, !dbg !24
|
109 |
+
%108 = bitcast float %104 to i32, !dbg !10
|
110 |
+
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !10
|
111 |
+
%110 = bitcast i32 %109 to float, !dbg !10
|
112 |
+
%111 = fadd float %104, %110, !dbg !24
|
113 |
+
%112 = bitcast float %111 to i32, !dbg !10
|
114 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !10
|
115 |
+
%114 = bitcast i32 %113 to float, !dbg !10
|
116 |
+
%115 = fadd float %111, %114, !dbg !24
|
117 |
+
%116 = bitcast float %105 to i32, !dbg !10
|
118 |
+
%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !10
|
119 |
+
%118 = bitcast i32 %117 to float, !dbg !10
|
120 |
+
%119 = fadd float %105, %118, !dbg !24
|
121 |
+
%120 = bitcast float %119 to i32, !dbg !10
|
122 |
+
%121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 8, i32 31), !dbg !10
|
123 |
+
%122 = bitcast i32 %121 to float, !dbg !10
|
124 |
+
%123 = fadd float %119, %122, !dbg !24
|
125 |
+
%124 = bitcast float %106 to i32, !dbg !10
|
126 |
+
%125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 16, i32 31), !dbg !10
|
127 |
+
%126 = bitcast i32 %125 to float, !dbg !10
|
128 |
+
%127 = fadd float %106, %126, !dbg !24
|
129 |
+
%128 = bitcast float %127 to i32, !dbg !10
|
130 |
+
%129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !10
|
131 |
+
%130 = bitcast i32 %129 to float, !dbg !10
|
132 |
+
%131 = fadd float %127, %130, !dbg !24
|
133 |
+
%132 = bitcast float %107 to i32, !dbg !10
|
134 |
+
%133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !10
|
135 |
+
%134 = bitcast i32 %133 to float, !dbg !10
|
136 |
+
%135 = fadd float %107, %134, !dbg !24
|
137 |
+
%136 = bitcast float %135 to i32, !dbg !10
|
138 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !10
|
139 |
+
%138 = bitcast i32 %137 to float, !dbg !10
|
140 |
+
%139 = fadd float %135, %138, !dbg !24
|
141 |
+
%140 = icmp ult i32 %7, 8, !dbg !10
|
142 |
+
%141 = shl nuw nsw i32 %9, 5, !dbg !10
|
143 |
+
%142 = or i32 %141, %11, !dbg !10
|
144 |
+
%143 = zext nneg i32 %142 to i64, !dbg !10
|
145 |
+
%144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !10
|
146 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %144, float %115, i1 %140) #3, !dbg !10
|
147 |
+
%145 = shl nuw nsw i32 %16, 3, !dbg !10
|
148 |
+
%146 = or i32 %145, %11, !dbg !10
|
149 |
+
%147 = zext nneg i32 %146 to i64, !dbg !10
|
150 |
+
%148 = getelementptr float, ptr addrspace(3) @global_smem, i64 %147, !dbg !10
|
151 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, float %123, i1 %140) #3, !dbg !10
|
152 |
+
%149 = shl nuw nsw i32 %17, 3, !dbg !10
|
153 |
+
%150 = or i32 %149, %11, !dbg !10
|
154 |
+
%151 = zext nneg i32 %150 to i64, !dbg !10
|
155 |
+
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
|
156 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %152, float %131, i1 %140) #3, !dbg !10
|
157 |
+
%153 = shl nuw nsw i32 %18, 3, !dbg !10
|
158 |
+
%154 = or i32 %153, %11, !dbg !10
|
159 |
+
%155 = zext nneg i32 %154 to i64, !dbg !10
|
160 |
+
%156 = getelementptr float, ptr addrspace(3) @global_smem, i64 %155, !dbg !10
|
161 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %156, float %139, i1 %140) #3, !dbg !10
|
162 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !10
|
163 |
+
%157 = icmp slt i32 %6, 256, !dbg !10
|
164 |
+
%158 = sext i32 %6 to i64, !dbg !10
|
165 |
+
%159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !10
|
166 |
+
%160 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %159, i1 %157) #3, !dbg !10
|
167 |
+
%161 = bitcast float %160 to i32, !dbg !10
|
168 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !10
|
169 |
+
%163 = bitcast i32 %162 to float, !dbg !10
|
170 |
+
%164 = fadd float %160, %163, !dbg !24
|
171 |
+
%165 = bitcast float %164 to i32, !dbg !10
|
172 |
+
%166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !10
|
173 |
+
%167 = bitcast i32 %166 to float, !dbg !10
|
174 |
+
%168 = fadd float %164, %167, !dbg !24
|
175 |
+
%169 = bitcast float %168 to i32, !dbg !10
|
176 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !10
|
177 |
+
%171 = bitcast i32 %170 to float, !dbg !10
|
178 |
+
%172 = fadd float %168, %171, !dbg !24
|
179 |
+
%173 = icmp eq i32 %9, 0, !dbg !10
|
180 |
+
%174 = and i1 %157, %173, !dbg !10
|
181 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %172, i1 %174) #3, !dbg !10
|
182 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !10
|
183 |
+
%175 = zext nneg i32 %141 to i64, !dbg !10
|
184 |
+
%176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !10
|
185 |
+
%177 = load float, ptr addrspace(3) %176, align 4, !dbg !10
|
186 |
+
%178 = zext nneg i32 %145 to i64, !dbg !10
|
187 |
+
%179 = getelementptr float, ptr addrspace(3) @global_smem, i64 %178, !dbg !10
|
188 |
+
%180 = load float, ptr addrspace(3) %179, align 4, !dbg !10
|
189 |
+
%181 = zext nneg i32 %149 to i64, !dbg !10
|
190 |
+
%182 = getelementptr float, ptr addrspace(3) @global_smem, i64 %181, !dbg !10
|
191 |
+
%183 = load float, ptr addrspace(3) %182, align 4, !dbg !10
|
192 |
+
%184 = zext nneg i32 %153 to i64, !dbg !10
|
193 |
+
%185 = getelementptr float, ptr addrspace(3) @global_smem, i64 %184, !dbg !10
|
194 |
+
%186 = load float, ptr addrspace(3) %185, align 4, !dbg !10
|
195 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
196 |
+
%187 = zext nneg i32 %10 to i64, !dbg !28
|
197 |
+
%188 = getelementptr float, ptr addrspace(3) @global_smem, i64 %187, !dbg !28
|
198 |
+
%189 = insertelement <1 x float> undef, float %177, i64 0, !dbg !28
|
199 |
+
store <1 x float> %189, ptr addrspace(3) %188, align 4, !dbg !28
|
200 |
+
%190 = zext nneg i32 %16 to i64, !dbg !28
|
201 |
+
%191 = getelementptr float, ptr addrspace(3) @global_smem, i64 %190, !dbg !28
|
202 |
+
%192 = insertelement <1 x float> undef, float %180, i64 0, !dbg !28
|
203 |
+
store <1 x float> %192, ptr addrspace(3) %191, align 4, !dbg !28
|
204 |
+
%193 = zext nneg i32 %17 to i64, !dbg !28
|
205 |
+
%194 = getelementptr float, ptr addrspace(3) @global_smem, i64 %193, !dbg !28
|
206 |
+
%195 = insertelement <1 x float> undef, float %183, i64 0, !dbg !28
|
207 |
+
store <1 x float> %195, ptr addrspace(3) %194, align 4, !dbg !28
|
208 |
+
%196 = zext nneg i32 %18 to i64, !dbg !28
|
209 |
+
%197 = getelementptr float, ptr addrspace(3) @global_smem, i64 %196, !dbg !28
|
210 |
+
%198 = insertelement <1 x float> undef, float %186, i64 0, !dbg !28
|
211 |
+
store <1 x float> %198, ptr addrspace(3) %197, align 4, !dbg !28
|
212 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
213 |
+
%199 = zext nneg i32 %7 to i64, !dbg !28
|
214 |
+
%200 = getelementptr float, ptr addrspace(3) @global_smem, i64 %199, !dbg !28
|
215 |
+
%201 = load <1 x float>, ptr addrspace(3) %200, align 4, !dbg !28
|
216 |
+
%.frozen = freeze i32 %22
|
217 |
+
%202 = sdiv i32 %.frozen, 256, !dbg !29
|
218 |
+
%203 = mul i32 %202, 256
|
219 |
+
%.decomposed = sub i32 %.frozen, %203
|
220 |
+
%204 = sext i32 %202 to i64, !dbg !30
|
221 |
+
%205 = getelementptr i64, ptr addrspace(1) %1, i64 %204, !dbg !30
|
222 |
+
%206 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %205, i1 true) #3, !dbg !31
|
223 |
+
%207 = lshr i64 %206, 54, !dbg !32
|
224 |
+
%208 = and i64 %207, 512, !dbg !32
|
225 |
+
%209 = add i64 %208, %206, !dbg !32
|
226 |
+
%210 = shl i64 %209, 8, !dbg !33
|
227 |
+
%211 = sext i32 %.decomposed to i64, !dbg !34
|
228 |
+
%212 = getelementptr float, ptr addrspace(1) %2, i64 %210, !dbg !35
|
229 |
+
%213 = getelementptr float, ptr addrspace(1) %212, i64 %211, !dbg !35
|
230 |
+
%214 = icmp eq i32 %11, 0, !dbg !36
|
231 |
+
%215 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %213, <1 x float> %201, i1 %214) #3, !dbg !36
|
232 |
+
ret void, !dbg !37
|
233 |
+
}
|
234 |
+
|
235 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
236 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
237 |
+
|
238 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
239 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
240 |
+
|
241 |
+
; Function Attrs: convergent nocallback nounwind
|
242 |
+
declare void @llvm.nvvm.barrier0() #2
|
243 |
+
|
244 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
245 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
246 |
+
attributes #2 = { convergent nocallback nounwind }
|
247 |
+
attributes #3 = { nounwind }
|
248 |
+
|
249 |
+
!llvm.module.flags = !{!0}
|
250 |
+
!llvm.dbg.cu = !{!1}
|
251 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
252 |
+
|
253 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
254 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
255 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
256 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
257 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
|
258 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
259 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
260 |
+
!7 = !{}
|
261 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
262 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
263 |
+
!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
|
264 |
+
!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
|
265 |
+
!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
266 |
+
!13 = !DILocation(line: 35, column: 25, scope: !11)
|
267 |
+
!14 = !DILocation(line: 21, column: 28, scope: !5)
|
268 |
+
!15 = !DILocation(line: 21, column: 33, scope: !5)
|
269 |
+
!16 = !DILocation(line: 22, column: 23, scope: !5)
|
270 |
+
!17 = !DILocation(line: 29, column: 25, scope: !5)
|
271 |
+
!18 = !DILocation(line: 31, column: 47, scope: !5)
|
272 |
+
!19 = !DILocation(line: 31, column: 40, scope: !5)
|
273 |
+
!20 = !DILocation(line: 31, column: 34, scope: !5)
|
274 |
+
!21 = !DILocation(line: 31, column: 53, scope: !5)
|
275 |
+
!22 = !DILocation(line: 33, column: 23, scope: !5)
|
276 |
+
!23 = !DILocation(line: 34, column: 38, scope: !5)
|
277 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
278 |
+
!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
|
279 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
280 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
281 |
+
!28 = !DILocation(line: 35, column: 28, scope: !5)
|
282 |
+
!29 = !DILocation(line: 36, column: 20, scope: !5)
|
283 |
+
!30 = !DILocation(line: 38, column: 30, scope: !5)
|
284 |
+
!31 = !DILocation(line: 38, column: 35, scope: !5)
|
285 |
+
!32 = !DILocation(line: 41, column: 32, scope: !5)
|
286 |
+
!33 = !DILocation(line: 45, column: 40, scope: !5)
|
287 |
+
!34 = !DILocation(line: 45, column: 36, scope: !5)
|
288 |
+
!35 = !DILocation(line: 45, column: 30, scope: !5)
|
289 |
+
!36 = !DILocation(line: 45, column: 55, scope: !5)
|
290 |
+
!37 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx
ADDED
@@ -0,0 +1,653 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4e
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4e(
|
13 |
+
.param .u64 triton__0d1d2d3de4e_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4e_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4e_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4e_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4e_param_4
|
18 |
+
)
|
19 |
+
.maxntid 256, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<30>;
|
22 |
+
.reg .b32 %r<112>;
|
23 |
+
.reg .f32 %f<76>;
|
24 |
+
.reg .b64 %rd<22>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_0];
|
30 |
+
ld.param.u64 %rd9, [triton__0d1d2d3de4e_param_1];
|
31 |
+
$L__tmp0:
|
32 |
+
.loc 1 22 44
|
33 |
+
mov.u32 %r48, %tid.x;
|
34 |
+
and.b32 %r49, %r48, 31;
|
35 |
+
ld.param.u64 %rd10, [triton__0d1d2d3de4e_param_2];
|
36 |
+
and.b32 %r50, %r48, 7;
|
37 |
+
shl.b32 %r51, %r50, 2;
|
38 |
+
.loc 1 24 33
|
39 |
+
bfe.u32 %r52, %r48, 5, 3;
|
40 |
+
bfe.u32 %r53, %r48, 3, 2;
|
41 |
+
shl.b32 %r54, %r52, 2;
|
42 |
+
or.b32 %r55, %r54, %r53;
|
43 |
+
or.b32 %r56, %r55, 96;
|
44 |
+
.loc 1 21 28
|
45 |
+
mov.u32 %r1, %ctaid.x;
|
46 |
+
.loc 1 21 33
|
47 |
+
shl.b32 %r57, %r1, 5;
|
48 |
+
.loc 1 22 23
|
49 |
+
or.b32 %r58, %r57, %r51;
|
50 |
+
or.b32 %r59, %r57, %r49;
|
51 |
+
.loc 1 29 25
|
52 |
+
setp.lt.u32 %p16, %r56, 120;
|
53 |
+
.loc 1 31 47
|
54 |
+
shl.b32 %r60, %r55, 17;
|
55 |
+
shl.b32 %r61, %r56, 17;
|
56 |
+
.loc 1 31 40
|
57 |
+
add.s32 %r62, %r58, %r60;
|
58 |
+
add.s32 %r63, %r62, 4194304;
|
59 |
+
add.s32 %r64, %r62, 8388608;
|
60 |
+
add.s32 %r65, %r58, %r61;
|
61 |
+
.loc 1 31 34
|
62 |
+
mul.wide.s32 %rd11, %r62, 4;
|
63 |
+
add.s64 %rd1, %rd8, %rd11;
|
64 |
+
mul.wide.s32 %rd12, %r63, 4;
|
65 |
+
add.s64 %rd2, %rd8, %rd12;
|
66 |
+
mul.wide.s32 %rd13, %r64, 4;
|
67 |
+
add.s64 %rd3, %rd8, %rd13;
|
68 |
+
mul.wide.s32 %rd14, %r65, 4;
|
69 |
+
add.s64 %rd4, %rd8, %rd14;
|
70 |
+
mov.b32 %r6, 0;
|
71 |
+
mov.pred %p1, -1;
|
72 |
+
.loc 1 31 53
|
73 |
+
mov.u32 %r2, 0x0;
|
74 |
+
mov.u32 %r3, 0x0;
|
75 |
+
mov.u32 %r4, 0x0;
|
76 |
+
mov.u32 %r5, 0x0;
|
77 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
78 |
+
@!%p1 mov.u32 %r2, %r6;
|
79 |
+
@!%p1 mov.u32 %r3, %r6;
|
80 |
+
@!%p1 mov.u32 %r4, %r6;
|
81 |
+
@!%p1 mov.u32 %r5, %r6;
|
82 |
+
mov.b32 %f1, %r2;
|
83 |
+
mov.b32 %f2, %r3;
|
84 |
+
mov.b32 %f3, %r4;
|
85 |
+
mov.b32 %f4, %r5;
|
86 |
+
mov.u32 %r10, 0x0;
|
87 |
+
mov.u32 %r11, 0x0;
|
88 |
+
mov.u32 %r12, 0x0;
|
89 |
+
mov.u32 %r13, 0x0;
|
90 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
91 |
+
@!%p1 mov.u32 %r10, %r6;
|
92 |
+
@!%p1 mov.u32 %r11, %r6;
|
93 |
+
@!%p1 mov.u32 %r12, %r6;
|
94 |
+
@!%p1 mov.u32 %r13, %r6;
|
95 |
+
mov.b32 %f5, %r10;
|
96 |
+
mov.b32 %f6, %r11;
|
97 |
+
mov.b32 %f7, %r12;
|
98 |
+
mov.b32 %f8, %r13;
|
99 |
+
mov.u32 %r18, 0x0;
|
100 |
+
mov.u32 %r19, 0x0;
|
101 |
+
mov.u32 %r20, 0x0;
|
102 |
+
mov.u32 %r21, 0x0;
|
103 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
104 |
+
@!%p1 mov.u32 %r18, %r6;
|
105 |
+
@!%p1 mov.u32 %r19, %r6;
|
106 |
+
@!%p1 mov.u32 %r20, %r6;
|
107 |
+
@!%p1 mov.u32 %r21, %r6;
|
108 |
+
mov.b32 %f9, %r18;
|
109 |
+
mov.b32 %f10, %r19;
|
110 |
+
mov.b32 %f11, %r20;
|
111 |
+
mov.b32 %f12, %r21;
|
112 |
+
mov.u32 %r26, 0x0;
|
113 |
+
mov.u32 %r27, 0x0;
|
114 |
+
mov.u32 %r28, 0x0;
|
115 |
+
mov.u32 %r29, 0x0;
|
116 |
+
@%p16 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
|
117 |
+
@!%p16 mov.u32 %r26, %r6;
|
118 |
+
@!%p16 mov.u32 %r27, %r6;
|
119 |
+
@!%p16 mov.u32 %r28, %r6;
|
120 |
+
@!%p16 mov.u32 %r29, %r6;
|
121 |
+
mov.b32 %f13, %r26;
|
122 |
+
mov.b32 %f14, %r27;
|
123 |
+
mov.b32 %f15, %r28;
|
124 |
+
mov.b32 %f16, %r29;
|
125 |
+
.loc 1 33 23
|
126 |
+
add.f32 %f17, %f1, 0f00000000;
|
127 |
+
add.f32 %f18, %f2, 0f00000000;
|
128 |
+
add.f32 %f19, %f3, 0f00000000;
|
129 |
+
add.f32 %f20, %f4, 0f00000000;
|
130 |
+
add.f32 %f21, %f5, 0f00000000;
|
131 |
+
add.f32 %f22, %f6, 0f00000000;
|
132 |
+
add.f32 %f23, %f7, 0f00000000;
|
133 |
+
add.f32 %f24, %f8, 0f00000000;
|
134 |
+
add.f32 %f25, %f9, 0f00000000;
|
135 |
+
add.f32 %f26, %f10, 0f00000000;
|
136 |
+
add.f32 %f27, %f11, 0f00000000;
|
137 |
+
add.f32 %f28, %f12, 0f00000000;
|
138 |
+
add.f32 %f29, %f13, 0f00000000;
|
139 |
+
add.f32 %f30, %f14, 0f00000000;
|
140 |
+
add.f32 %f31, %f15, 0f00000000;
|
141 |
+
add.f32 %f32, %f16, 0f00000000;
|
142 |
+
.loc 1 34 38
|
143 |
+
selp.f32 %f33, %f29, 0f00000000, %p16;
|
144 |
+
selp.f32 %f34, %f30, 0f00000000, %p16;
|
145 |
+
selp.f32 %f35, %f31, 0f00000000, %p16;
|
146 |
+
selp.f32 %f36, %f32, 0f00000000, %p16;
|
147 |
+
$L__tmp1:
|
148 |
+
.loc 2 233 15
|
149 |
+
add.f32 %f37, %f17, %f21;
|
150 |
+
add.f32 %f38, %f18, %f22;
|
151 |
+
add.f32 %f39, %f19, %f23;
|
152 |
+
add.f32 %f40, %f20, %f24;
|
153 |
+
add.f32 %f41, %f37, %f25;
|
154 |
+
add.f32 %f42, %f38, %f26;
|
155 |
+
add.f32 %f43, %f39, %f27;
|
156 |
+
add.f32 %f44, %f40, %f28;
|
157 |
+
add.f32 %f45, %f41, %f33;
|
158 |
+
add.f32 %f46, %f42, %f34;
|
159 |
+
add.f32 %f47, %f43, %f35;
|
160 |
+
add.f32 %f48, %f44, %f36;
|
161 |
+
$L__tmp2:
|
162 |
+
.loc 2 243 36
|
163 |
+
mov.b32 %r66, %f45;
|
164 |
+
shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
|
165 |
+
mov.b32 %f49, %r67;
|
166 |
+
$L__tmp3:
|
167 |
+
.loc 2 233 15
|
168 |
+
add.f32 %f50, %f45, %f49;
|
169 |
+
$L__tmp4:
|
170 |
+
.loc 2 243 36
|
171 |
+
mov.b32 %r68, %f50;
|
172 |
+
shfl.sync.bfly.b32 %r69, %r68, 8, 31, -1;
|
173 |
+
mov.b32 %f51, %r69;
|
174 |
+
$L__tmp5:
|
175 |
+
.loc 2 233 15
|
176 |
+
add.f32 %f52, %f50, %f51;
|
177 |
+
$L__tmp6:
|
178 |
+
.loc 2 243 36
|
179 |
+
mov.b32 %r70, %f46;
|
180 |
+
shfl.sync.bfly.b32 %r71, %r70, 16, 31, -1;
|
181 |
+
mov.b32 %f53, %r71;
|
182 |
+
$L__tmp7:
|
183 |
+
.loc 2 233 15
|
184 |
+
add.f32 %f54, %f46, %f53;
|
185 |
+
$L__tmp8:
|
186 |
+
.loc 2 243 36
|
187 |
+
mov.b32 %r72, %f54;
|
188 |
+
shfl.sync.bfly.b32 %r73, %r72, 8, 31, -1;
|
189 |
+
mov.b32 %f55, %r73;
|
190 |
+
$L__tmp9:
|
191 |
+
.loc 2 233 15
|
192 |
+
add.f32 %f56, %f54, %f55;
|
193 |
+
$L__tmp10:
|
194 |
+
.loc 2 243 36
|
195 |
+
mov.b32 %r74, %f47;
|
196 |
+
shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
|
197 |
+
mov.b32 %f57, %r75;
|
198 |
+
$L__tmp11:
|
199 |
+
.loc 2 233 15
|
200 |
+
add.f32 %f58, %f47, %f57;
|
201 |
+
$L__tmp12:
|
202 |
+
.loc 2 243 36
|
203 |
+
mov.b32 %r76, %f58;
|
204 |
+
shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
|
205 |
+
mov.b32 %f59, %r77;
|
206 |
+
$L__tmp13:
|
207 |
+
.loc 2 233 15
|
208 |
+
add.f32 %f60, %f58, %f59;
|
209 |
+
$L__tmp14:
|
210 |
+
.loc 2 243 36
|
211 |
+
mov.b32 %r78, %f48;
|
212 |
+
shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
|
213 |
+
mov.b32 %f61, %r79;
|
214 |
+
$L__tmp15:
|
215 |
+
.loc 2 233 15
|
216 |
+
add.f32 %f62, %f48, %f61;
|
217 |
+
$L__tmp16:
|
218 |
+
.loc 2 243 36
|
219 |
+
mov.b32 %r80, %f62;
|
220 |
+
shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
|
221 |
+
mov.b32 %f63, %r81;
|
222 |
+
$L__tmp17:
|
223 |
+
.loc 2 233 15
|
224 |
+
add.f32 %f64, %f62, %f63;
|
225 |
+
$L__tmp18:
|
226 |
+
.loc 2 243 36
|
227 |
+
setp.lt.u32 %p21, %r49, 8;
|
228 |
+
shl.b32 %r82, %r50, 7;
|
229 |
+
or.b32 %r83, %r82, %r54;
|
230 |
+
mov.u32 %r84, global_smem;
|
231 |
+
add.s32 %r34, %r84, %r83;
|
232 |
+
mov.b32 %r35, %f52;
|
233 |
+
@%p21 st.shared.b32 [ %r34 + 0 ], %r35;
|
234 |
+
or.b32 %r85, %r82, 32;
|
235 |
+
or.b32 %r86, %r85, %r54;
|
236 |
+
add.s32 %r36, %r84, %r86;
|
237 |
+
mov.b32 %r37, %f56;
|
238 |
+
@%p21 st.shared.b32 [ %r36 + 0 ], %r37;
|
239 |
+
or.b32 %r87, %r82, 64;
|
240 |
+
or.b32 %r88, %r87, %r54;
|
241 |
+
add.s32 %r38, %r84, %r88;
|
242 |
+
mov.b32 %r39, %f60;
|
243 |
+
@%p21 st.shared.b32 [ %r38 + 0 ], %r39;
|
244 |
+
or.b32 %r89, %r82, 96;
|
245 |
+
or.b32 %r90, %r89, %r54;
|
246 |
+
add.s32 %r40, %r84, %r90;
|
247 |
+
mov.b32 %r41, %f64;
|
248 |
+
@%p21 st.shared.b32 [ %r40 + 0 ], %r41;
|
249 |
+
bar.sync 0;
|
250 |
+
setp.lt.s32 %p25, %r48, 256;
|
251 |
+
shl.b32 %r91, %r48, 2;
|
252 |
+
add.s32 %r43, %r84, %r91;
|
253 |
+
@%p25 ld.shared.b32 %r42, [ %r43 + 0 ];
|
254 |
+
mov.b32 %f65, %r42;
|
255 |
+
shfl.sync.bfly.b32 %r92, %r42, 4, 31, -1;
|
256 |
+
mov.b32 %f66, %r92;
|
257 |
+
$L__tmp19:
|
258 |
+
.loc 2 233 15
|
259 |
+
add.f32 %f67, %f65, %f66;
|
260 |
+
$L__tmp20:
|
261 |
+
.loc 2 243 36
|
262 |
+
mov.b32 %r93, %f67;
|
263 |
+
shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
|
264 |
+
mov.b32 %f68, %r94;
|
265 |
+
$L__tmp21:
|
266 |
+
.loc 2 233 15
|
267 |
+
add.f32 %f69, %f67, %f68;
|
268 |
+
$L__tmp22:
|
269 |
+
.loc 2 243 36
|
270 |
+
mov.b32 %r95, %f69;
|
271 |
+
shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
|
272 |
+
mov.b32 %f70, %r96;
|
273 |
+
$L__tmp23:
|
274 |
+
.loc 2 233 15
|
275 |
+
add.f32 %f71, %f69, %f70;
|
276 |
+
$L__tmp24:
|
277 |
+
.loc 2 243 36
|
278 |
+
setp.eq.s32 %p29, %r50, 0;
|
279 |
+
and.pred %p26, %p25, %p29;
|
280 |
+
mov.b32 %r45, %f71;
|
281 |
+
@%p26 st.shared.b32 [ %r43 + 0 ], %r45;
|
282 |
+
bar.sync 0;
|
283 |
+
add.s32 %r97, %r84, %r82;
|
284 |
+
ld.shared.f32 %f72, [%r97];
|
285 |
+
add.s32 %r98, %r84, %r85;
|
286 |
+
ld.shared.f32 %f73, [%r98];
|
287 |
+
add.s32 %r99, %r84, %r87;
|
288 |
+
ld.shared.f32 %f74, [%r99];
|
289 |
+
add.s32 %r100, %r84, %r89;
|
290 |
+
ld.shared.f32 %f75, [%r100];
|
291 |
+
$L__tmp25:
|
292 |
+
.loc 1 35 28
|
293 |
+
bar.sync 0;
|
294 |
+
shl.b32 %r101, %r50, 4;
|
295 |
+
add.s32 %r102, %r84, %r101;
|
296 |
+
st.shared.f32 [%r102], %f72;
|
297 |
+
st.shared.f32 [%r102+4], %f73;
|
298 |
+
st.shared.f32 [%r102+8], %f74;
|
299 |
+
st.shared.f32 [%r102+12], %f75;
|
300 |
+
bar.sync 0;
|
301 |
+
shl.b32 %r103, %r49, 2;
|
302 |
+
add.s32 %r104, %r84, %r103;
|
303 |
+
.loc 1 36 20
|
304 |
+
shr.s32 %r106, %r59, 31;
|
305 |
+
shr.u32 %r107, %r106, 24;
|
306 |
+
add.s32 %r108, %r59, %r107;
|
307 |
+
shr.s32 %r109, %r108, 8;
|
308 |
+
and.b32 %r110, %r108, -256;
|
309 |
+
sub.s32 %r111, %r59, %r110;
|
310 |
+
.loc 1 38 30
|
311 |
+
mul.wide.s32 %rd15, %r109, 8;
|
312 |
+
add.s64 %rd6, %rd9, %rd15;
|
313 |
+
.loc 1 45 55
|
314 |
+
ld.shared.u32 %r47, [%r104];
|
315 |
+
.loc 1 38 35
|
316 |
+
mov.u64 %rd5, 0x0;
|
317 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd5 }, [ %rd6 + 0 ];
|
318 |
+
.loc 1 41 32
|
319 |
+
shr.u64 %rd16, %rd5, 54;
|
320 |
+
and.b64 %rd17, %rd16, 512;
|
321 |
+
add.s64 %rd18, %rd17, %rd5;
|
322 |
+
.loc 1 45 30
|
323 |
+
shl.b64 %rd19, %rd18, 10;
|
324 |
+
add.s64 %rd20, %rd10, %rd19;
|
325 |
+
mul.wide.s32 %rd21, %r111, 4;
|
326 |
+
add.s64 %rd7, %rd20, %rd21;
|
327 |
+
.loc 1 45 55
|
328 |
+
setp.eq.s32 %p28, %r52, 0;
|
329 |
+
mov.u32 %r46, 0x0;
|
330 |
+
@%p28 atom.global.gpu.acq_rel.add.f32 %r46, [ %rd7 + 0 ], %r47;
|
331 |
+
.loc 1 45 4
|
332 |
+
ret;
|
333 |
+
$L__tmp26:
|
334 |
+
$L__func_end0:
|
335 |
+
|
336 |
+
}
|
337 |
+
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
|
338 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
339 |
+
.section .debug_abbrev
|
340 |
+
{
|
341 |
+
.b8 1
|
342 |
+
.b8 17
|
343 |
+
.b8 1
|
344 |
+
.b8 37
|
345 |
+
.b8 8
|
346 |
+
.b8 19
|
347 |
+
.b8 5
|
348 |
+
.b8 3
|
349 |
+
.b8 8
|
350 |
+
.b8 16
|
351 |
+
.b8 6
|
352 |
+
.b8 27
|
353 |
+
.b8 8
|
354 |
+
.b8 180
|
355 |
+
.b8 66
|
356 |
+
.b8 12
|
357 |
+
.b8 17
|
358 |
+
.b8 1
|
359 |
+
.b8 18
|
360 |
+
.b8 1
|
361 |
+
.b8 0
|
362 |
+
.b8 0
|
363 |
+
.b8 2
|
364 |
+
.b8 46
|
365 |
+
.b8 0
|
366 |
+
.b8 135
|
367 |
+
.b8 64
|
368 |
+
.b8 8
|
369 |
+
.b8 3
|
370 |
+
.b8 8
|
371 |
+
.b8 58
|
372 |
+
.b8 11
|
373 |
+
.b8 59
|
374 |
+
.b8 11
|
375 |
+
.b8 63
|
376 |
+
.b8 12
|
377 |
+
.b8 32
|
378 |
+
.b8 11
|
379 |
+
.b8 0
|
380 |
+
.b8 0
|
381 |
+
.b8 3
|
382 |
+
.b8 46
|
383 |
+
.b8 1
|
384 |
+
.b8 17
|
385 |
+
.b8 1
|
386 |
+
.b8 18
|
387 |
+
.b8 1
|
388 |
+
.b8 64
|
389 |
+
.b8 10
|
390 |
+
.b8 49
|
391 |
+
.b8 19
|
392 |
+
.b8 0
|
393 |
+
.b8 0
|
394 |
+
.b8 4
|
395 |
+
.b8 29
|
396 |
+
.b8 1
|
397 |
+
.b8 49
|
398 |
+
.b8 19
|
399 |
+
.b8 17
|
400 |
+
.b8 1
|
401 |
+
.b8 18
|
402 |
+
.b8 1
|
403 |
+
.b8 88
|
404 |
+
.b8 11
|
405 |
+
.b8 89
|
406 |
+
.b8 11
|
407 |
+
.b8 87
|
408 |
+
.b8 11
|
409 |
+
.b8 0
|
410 |
+
.b8 0
|
411 |
+
.b8 5
|
412 |
+
.b8 29
|
413 |
+
.b8 0
|
414 |
+
.b8 49
|
415 |
+
.b8 19
|
416 |
+
.b8 17
|
417 |
+
.b8 1
|
418 |
+
.b8 18
|
419 |
+
.b8 1
|
420 |
+
.b8 88
|
421 |
+
.b8 11
|
422 |
+
.b8 89
|
423 |
+
.b8 11
|
424 |
+
.b8 87
|
425 |
+
.b8 11
|
426 |
+
.b8 0
|
427 |
+
.b8 0
|
428 |
+
.b8 0
|
429 |
+
}
|
430 |
+
.section .debug_info
|
431 |
+
{
|
432 |
+
.b32 264
|
433 |
+
.b8 2
|
434 |
+
.b8 0
|
435 |
+
.b32 .debug_abbrev
|
436 |
+
.b8 8
|
437 |
+
.b8 1
|
438 |
+
.b8 116
|
439 |
+
.b8 114
|
440 |
+
.b8 105
|
441 |
+
.b8 116
|
442 |
+
.b8 111
|
443 |
+
.b8 110
|
444 |
+
.b8 0
|
445 |
+
.b8 2
|
446 |
+
.b8 0
|
447 |
+
.b8 99
|
448 |
+
.b8 54
|
449 |
+
.b8 105
|
450 |
+
.b8 107
|
451 |
+
.b8 53
|
452 |
+
.b8 118
|
453 |
+
.b8 120
|
454 |
+
.b8 55
|
455 |
+
.b8 112
|
456 |
+
.b8 50
|
457 |
+
.b8 50
|
458 |
+
.b8 102
|
459 |
+
.b8 112
|
460 |
+
.b8 107
|
461 |
+
.b8 52
|
462 |
+
.b8 100
|
463 |
+
.b8 99
|
464 |
+
.b8 118
|
465 |
+
.b8 104
|
466 |
+
.b8 53
|
467 |
+
.b8 53
|
468 |
+
.b8 122
|
469 |
+
.b8 105
|
470 |
+
.b8 109
|
471 |
+
.b8 119
|
472 |
+
.b8 52
|
473 |
+
.b8 116
|
474 |
+
.b8 53
|
475 |
+
.b8 110
|
476 |
+
.b8 114
|
477 |
+
.b8 53
|
478 |
+
.b8 122
|
479 |
+
.b8 110
|
480 |
+
.b8 50
|
481 |
+
.b8 98
|
482 |
+
.b8 55
|
483 |
+
.b8 105
|
484 |
+
.b8 110
|
485 |
+
.b8 117
|
486 |
+
.b8 106
|
487 |
+
.b8 120
|
488 |
+
.b8 106
|
489 |
+
.b8 97
|
490 |
+
.b8 117
|
491 |
+
.b8 120
|
492 |
+
.b8 115
|
493 |
+
.b8 104
|
494 |
+
.b8 108
|
495 |
+
.b8 106
|
496 |
+
.b8 117
|
497 |
+
.b8 109
|
498 |
+
.b8 109
|
499 |
+
.b8 46
|
500 |
+
.b8 112
|
501 |
+
.b8 121
|
502 |
+
.b8 0
|
503 |
+
.b32 .debug_line
|
504 |
+
.b8 47
|
505 |
+
.b8 116
|
506 |
+
.b8 109
|
507 |
+
.b8 112
|
508 |
+
.b8 47
|
509 |
+
.b8 116
|
510 |
+
.b8 111
|
511 |
+
.b8 114
|
512 |
+
.b8 99
|
513 |
+
.b8 104
|
514 |
+
.b8 105
|
515 |
+
.b8 110
|
516 |
+
.b8 100
|
517 |
+
.b8 117
|
518 |
+
.b8 99
|
519 |
+
.b8 116
|
520 |
+
.b8 111
|
521 |
+
.b8 114
|
522 |
+
.b8 95
|
523 |
+
.b8 114
|
524 |
+
.b8 111
|
525 |
+
.b8 111
|
526 |
+
.b8 116
|
527 |
+
.b8 47
|
528 |
+
.b8 54
|
529 |
+
.b8 105
|
530 |
+
.b8 0
|
531 |
+
.b8 1
|
532 |
+
.b64 $L__func_begin0
|
533 |
+
.b64 $L__func_end0
|
534 |
+
.b8 2
|
535 |
+
.b8 116
|
536 |
+
.b8 114
|
537 |
+
.b8 105
|
538 |
+
.b8 116
|
539 |
+
.b8 111
|
540 |
+
.b8 110
|
541 |
+
.b8 95
|
542 |
+
.b8 95
|
543 |
+
.b8 48
|
544 |
+
.b8 100
|
545 |
+
.b8 49
|
546 |
+
.b8 100
|
547 |
+
.b8 50
|
548 |
+
.b8 100
|
549 |
+
.b8 51
|
550 |
+
.b8 100
|
551 |
+
.b8 101
|
552 |
+
.b8 52
|
553 |
+
.b8 101
|
554 |
+
.b8 0
|
555 |
+
.b8 116
|
556 |
+
.b8 114
|
557 |
+
.b8 105
|
558 |
+
.b8 116
|
559 |
+
.b8 111
|
560 |
+
.b8 110
|
561 |
+
.b8 95
|
562 |
+
.b8 95
|
563 |
+
.b8 48
|
564 |
+
.b8 100
|
565 |
+
.b8 49
|
566 |
+
.b8 100
|
567 |
+
.b8 50
|
568 |
+
.b8 100
|
569 |
+
.b8 51
|
570 |
+
.b8 100
|
571 |
+
.b8 101
|
572 |
+
.b8 52
|
573 |
+
.b8 101
|
574 |
+
.b8 0
|
575 |
+
.b8 1
|
576 |
+
.b8 18
|
577 |
+
.b8 1
|
578 |
+
.b8 1
|
579 |
+
.b8 3
|
580 |
+
.b64 $L__func_begin0
|
581 |
+
.b64 $L__func_end0
|
582 |
+
.b8 1
|
583 |
+
.b8 156
|
584 |
+
.b32 125
|
585 |
+
.b8 4
|
586 |
+
.b32 125
|
587 |
+
.b64 $L__tmp1
|
588 |
+
.b64 $L__tmp24
|
589 |
+
.b8 2
|
590 |
+
.b8 35
|
591 |
+
.b8 25
|
592 |
+
.b8 5
|
593 |
+
.b32 125
|
594 |
+
.b64 $L__tmp1
|
595 |
+
.b64 $L__tmp24
|
596 |
+
.b8 2
|
597 |
+
.b8 243
|
598 |
+
.b8 36
|
599 |
+
.b8 0
|
600 |
+
.b8 5
|
601 |
+
.b32 125
|
602 |
+
.b64 $L__tmp2
|
603 |
+
.b64 $L__tmp25
|
604 |
+
.b8 2
|
605 |
+
.b8 35
|
606 |
+
.b8 25
|
607 |
+
.b8 0
|
608 |
+
.b8 0
|
609 |
+
}
|
610 |
+
.section .debug_pubnames
|
611 |
+
{
|
612 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
613 |
+
$L__pubNames_start0:
|
614 |
+
.b8 2
|
615 |
+
.b8 0
|
616 |
+
.b32 .debug_info
|
617 |
+
.b32 268
|
618 |
+
.b32 125
|
619 |
+
.b8 116
|
620 |
+
.b8 114
|
621 |
+
.b8 105
|
622 |
+
.b8 116
|
623 |
+
.b8 111
|
624 |
+
.b8 110
|
625 |
+
.b8 95
|
626 |
+
.b8 95
|
627 |
+
.b8 48
|
628 |
+
.b8 100
|
629 |
+
.b8 49
|
630 |
+
.b8 100
|
631 |
+
.b8 50
|
632 |
+
.b8 100
|
633 |
+
.b8 51
|
634 |
+
.b8 100
|
635 |
+
.b8 101
|
636 |
+
.b8 52
|
637 |
+
.b8 101
|
638 |
+
.b8 0
|
639 |
+
.b32 0
|
640 |
+
$L__pubNames_end0:
|
641 |
+
}
|
642 |
+
.section .debug_pubtypes
|
643 |
+
{
|
644 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
645 |
+
$L__pubTypes_start0:
|
646 |
+
.b8 2
|
647 |
+
.b8 0
|
648 |
+
.b32 .debug_info
|
649 |
+
.b32 268
|
650 |
+
.b32 0
|
651 |
+
$L__pubTypes_end0:
|
652 |
+
}
|
653 |
+
.section .debug_loc { }
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<32x1xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<32x1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<512> : tensor<32x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<32x1xi32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
|
10 |
+
%cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
|
11 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1>
|
12 |
+
%cst_6 = arith.constant dense<true> : tensor<32x1xi1, #blocked>
|
13 |
+
%c32_i32 = arith.constant 32 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c32_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
17 |
+
%3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32x1xi32, #blocked1>
|
19 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xi32, #blocked>
|
20 |
+
%6 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked1>
|
21 |
+
%7 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked>
|
22 |
+
%8 = arith.addi %6, %4 : tensor<32x1xi32, #blocked1>
|
23 |
+
%9 = arith.addi %7, %5 : tensor<32x1xi32, #blocked>
|
24 |
+
%10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
25 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
|
26 |
+
%12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
|
27 |
+
%13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
|
28 |
+
%14 = tt.broadcast %8 : (tensor<32x1xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
|
29 |
+
%15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
|
30 |
+
%16 = arith.addi %14, %15 : tensor<32x128xi32, #blocked1>
|
31 |
+
%17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>, #blocked1>
|
32 |
+
%18 = tt.addptr %17, %16 : tensor<32x128x!tt.ptr<f32, 1>, #blocked1>, tensor<32x128xi32, #blocked1>
|
33 |
+
%19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<32x128xi1, #blocked1>
|
34 |
+
%20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32, #blocked1>
|
35 |
+
%21 = arith.addf %20, %cst_5 : tensor<32x128xf32, #blocked1>
|
36 |
+
%22 = arith.select %19, %21, %cst_5 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1>
|
37 |
+
%23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
|
38 |
+
^bb0(%arg5: f32, %arg6: f32):
|
39 |
+
%40 = arith.addf %arg5, %arg6 : f32
|
40 |
+
tt.reduce.return %40 : f32
|
41 |
+
}) : (tensor<32x128xf32, #blocked1>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
42 |
+
%24 = triton_gpu.convert_layout %23 : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
43 |
+
%25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xf32, #blocked>
|
44 |
+
%26 = arith.divsi %9, %cst_2 : tensor<32x1xi32, #blocked>
|
45 |
+
%27 = arith.remsi %9, %cst_2 : tensor<32x1xi32, #blocked>
|
46 |
+
%28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>, #blocked>
|
47 |
+
%29 = tt.addptr %28, %26 : tensor<32x1x!tt.ptr<i64, 1>, #blocked>, tensor<32x1xi32, #blocked>
|
48 |
+
%30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64, #blocked>
|
49 |
+
%31 = arith.addi %30, %cst_1 : tensor<32x1xi64, #blocked>
|
50 |
+
%32 = arith.cmpi slt, %30, %cst_0 : tensor<32x1xi64, #blocked>
|
51 |
+
%33 = arith.select %32, %31, %30 : tensor<32x1xi1, #blocked>, tensor<32x1xi64, #blocked>
|
52 |
+
%34 = arith.muli %33, %cst : tensor<32x1xi64, #blocked>
|
53 |
+
%35 = arith.extsi %27 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked>
|
54 |
+
%36 = arith.addi %35, %34 : tensor<32x1xi64, #blocked>
|
55 |
+
%37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>, #blocked>
|
56 |
+
%38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xi64, #blocked>
|
57 |
+
%39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xf32, #blocked>, tensor<32x1xi1, #blocked>) -> tensor<32x1xf32, #blocked>
|
58 |
+
tt.return
|
59 |
+
}
|
60 |
+
}
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<32x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<32x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<32x1xi64>
|
6 |
+
%cst_2 = arith.constant dense<true> : tensor<32x1xi1>
|
7 |
+
%cst_3 = arith.constant dense<256> : tensor<32x1xi32>
|
8 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
|
9 |
+
%cst_5 = arith.constant dense<120> : tensor<1x128xi32>
|
10 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32>
|
11 |
+
%c32_i32 = arith.constant 32 : i32
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.muli %0, %c32_i32 : i32
|
14 |
+
%2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32>
|
16 |
+
%4 = tt.splat %1 : (i32) -> tensor<32x1xi32>
|
17 |
+
%5 = arith.addi %4, %3 : tensor<32x1xi32>
|
18 |
+
%6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
19 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
|
20 |
+
%8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
|
21 |
+
%9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
|
22 |
+
%10 = tt.broadcast %5 : (tensor<32x1xi32>) -> tensor<32x128xi32>
|
23 |
+
%11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<32x128xi32>
|
24 |
+
%12 = arith.addi %10, %11 : tensor<32x128xi32>
|
25 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>>
|
26 |
+
%14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr<f32, 1>>, tensor<32x128xi32>
|
27 |
+
%15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<32x128xi1>
|
28 |
+
%16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32>
|
29 |
+
%17 = arith.addf %16, %cst_6 : tensor<32x128xf32>
|
30 |
+
%18 = arith.select %15, %17, %cst_6 : tensor<32x128xi1>, tensor<32x128xf32>
|
31 |
+
%19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
|
32 |
+
^bb0(%arg5: f32, %arg6: f32):
|
33 |
+
%35 = arith.addf %arg5, %arg6 : f32
|
34 |
+
tt.reduce.return %35 : f32
|
35 |
+
}) : (tensor<32x128xf32>) -> tensor<32xf32>
|
36 |
+
%20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<32xf32>) -> tensor<32x1xf32>
|
37 |
+
%21 = arith.divsi %5, %cst_3 : tensor<32x1xi32>
|
38 |
+
%22 = arith.remsi %5, %cst_3 : tensor<32x1xi32>
|
39 |
+
%23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>>
|
40 |
+
%24 = tt.addptr %23, %21 : tensor<32x1x!tt.ptr<i64, 1>>, tensor<32x1xi32>
|
41 |
+
%25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64>
|
42 |
+
%26 = arith.addi %25, %cst_1 : tensor<32x1xi64>
|
43 |
+
%27 = arith.cmpi slt, %25, %cst_0 : tensor<32x1xi64>
|
44 |
+
%28 = arith.select %27, %26, %25 : tensor<32x1xi1>, tensor<32x1xi64>
|
45 |
+
%29 = arith.muli %28, %cst : tensor<32x1xi64>
|
46 |
+
%30 = arith.extsi %22 : tensor<32x1xi32> to tensor<32x1xi64>
|
47 |
+
%31 = arith.addi %30, %29 : tensor<32x1xi64>
|
48 |
+
%32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>>
|
49 |
+
%33 = tt.addptr %32, %31 : tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xi64>
|
50 |
+
%34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xf32>, tensor<32x1xi1>) -> tensor<32x1xf32>
|
51 |
+
tt.return
|
52 |
+
}
|
53 |
+
}
|
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
9 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
|
10 |
+
%4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
11 |
+
%5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
|
12 |
+
%6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
|
13 |
+
%7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
|
14 |
+
%8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
15 |
+
%9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
16 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
17 |
+
%11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
|
18 |
+
%12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
|
19 |
+
%13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
|
20 |
+
%14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
|
21 |
+
tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
|
22 |
+
tt.return
|
23 |
+
}
|
24 |
+
}
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin
ADDED
Binary file (14.1 kB). View file
|
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
|
9 |
+
%c0_i32 = arith.constant 0 : i32
|
10 |
+
%c128_i32 = arith.constant 128 : i32
|
11 |
+
%c8_i32 = arith.constant 8 : i32
|
12 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
13 |
+
%c64_i32 = arith.constant 64 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
17 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
18 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
19 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
20 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
21 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
22 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
23 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
24 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
25 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
26 |
+
%12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
27 |
+
%13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
|
28 |
+
%14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
29 |
+
%15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
|
30 |
+
%16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
31 |
+
%17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
32 |
+
%18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
|
34 |
+
%25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
|
35 |
+
%26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
|
36 |
+
%27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
|
37 |
+
%28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
|
38 |
+
%29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
39 |
+
%30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
|
40 |
+
%31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
|
41 |
+
%32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
42 |
+
%33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
43 |
+
%34 = tt.load %32, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
44 |
+
%35 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
45 |
+
%36 = tt.load %35, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
46 |
+
%37 = arith.mulf %34, %36 : tensor<64x8xf32, #blocked>
|
47 |
+
%38 = arith.addf %arg6, %37 : tensor<64x8xf32, #blocked>
|
48 |
+
%39 = arith.select %33, %38, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
49 |
+
scf.yield %39 : tensor<64x8xf32, #blocked>
|
50 |
+
}
|
51 |
+
%20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
|
52 |
+
^bb0(%arg5: f32, %arg6: f32):
|
53 |
+
%25 = arith.addf %arg5, %arg6 : f32
|
54 |
+
tt.reduce.return %25 : f32
|
55 |
+
}) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
56 |
+
%21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
57 |
+
%22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
|
58 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
|
59 |
+
%24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
60 |
+
tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
|
61 |
+
tt.return
|
62 |
+
}
|
63 |
+
}
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx
ADDED
@@ -0,0 +1,577 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4de(
|
13 |
+
.param .u64 triton__0d1d2d3de4de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4de_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4de_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4de_param_4
|
18 |
+
)
|
19 |
+
.maxntid 128, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<20>;
|
22 |
+
.reg .b16 %rs<5>;
|
23 |
+
.reg .b32 %r<98>;
|
24 |
+
.reg .f32 %f<47>;
|
25 |
+
.reg .b64 %rd<10>;
|
26 |
+
.loc 1 18 0
|
27 |
+
$L__func_begin0:
|
28 |
+
.loc 1 18 0
|
29 |
+
|
30 |
+
ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
|
31 |
+
ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
|
32 |
+
ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
|
33 |
+
$L__tmp0:
|
34 |
+
.loc 1 22 44
|
35 |
+
mov.u32 %r1, %tid.x;
|
36 |
+
and.b32 %r2, %r1, 31;
|
37 |
+
shl.b32 %r13, %r1, 2;
|
38 |
+
and.b32 %r3, %r13, 60;
|
39 |
+
.loc 1 24 33
|
40 |
+
bfe.u32 %r4, %r1, 5, 2;
|
41 |
+
.loc 1 21 28
|
42 |
+
mov.u32 %r11, %ctaid.x;
|
43 |
+
.loc 1 21 33
|
44 |
+
shl.b32 %r5, %r11, 6;
|
45 |
+
.loc 1 22 23
|
46 |
+
or.b32 %r14, %r5, %r3;
|
47 |
+
.loc 1 26 20
|
48 |
+
shr.s32 %r16, %r14, 31;
|
49 |
+
shr.u32 %r17, %r16, 24;
|
50 |
+
add.s32 %r18, %r14, %r17;
|
51 |
+
shr.s32 %r19, %r18, 8;
|
52 |
+
.loc 1 29 36
|
53 |
+
mad.lo.s32 %r20, %r19, 32512, %r14;
|
54 |
+
shl.b32 %r21, %r4, 9;
|
55 |
+
add.s32 %r22, %r20, %r21;
|
56 |
+
shl.b32 %r23, %r1, 4;
|
57 |
+
and.b32 %r24, %r23, 256;
|
58 |
+
add.s32 %r96, %r22, %r24;
|
59 |
+
mov.f32 %f43, 0f00000000;
|
60 |
+
mov.b32 %r97, -8;
|
61 |
+
mov.pred %p1, -1;
|
62 |
+
mov.f32 %f44, %f43;
|
63 |
+
mov.f32 %f45, %f43;
|
64 |
+
mov.f32 %f46, %f43;
|
65 |
+
$L__BB0_1:
|
66 |
+
.loc 1 33 34
|
67 |
+
mul.wide.s32 %rd6, %r96, 2;
|
68 |
+
add.s64 %rd4, %rd1, %rd6;
|
69 |
+
mov.b32 %r27, 0;
|
70 |
+
.loc 1 33 63
|
71 |
+
mov.u32 %r25, 0x0;
|
72 |
+
mov.u32 %r26, 0x0;
|
73 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ];
|
74 |
+
@!%p1 mov.u32 %r25, %r27;
|
75 |
+
@!%p1 mov.u32 %r26, %r27;
|
76 |
+
cvt.u16.u32 %rs1, %r25;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; }
|
78 |
+
cvt.u16.u32 %rs3, %r26;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; }
|
80 |
+
.loc 1 33 115
|
81 |
+
cvt.f32.bf16 %r29, %rs1;
|
82 |
+
mov.b32 %f13, %r29;
|
83 |
+
cvt.f32.bf16 %r30, %rs2;
|
84 |
+
mov.b32 %f14, %r30;
|
85 |
+
cvt.f32.bf16 %r31, %rs3;
|
86 |
+
mov.b32 %f15, %r31;
|
87 |
+
cvt.f32.bf16 %r32, %rs4;
|
88 |
+
mov.b32 %f16, %r32;
|
89 |
+
.loc 1 34 34
|
90 |
+
mul.wide.s32 %rd7, %r96, 4;
|
91 |
+
add.s64 %rd5, %rd2, %rd7;
|
92 |
+
.loc 1 34 63
|
93 |
+
mov.u32 %r33, 0x0;
|
94 |
+
mov.u32 %r34, 0x0;
|
95 |
+
mov.u32 %r35, 0x0;
|
96 |
+
mov.u32 %r36, 0x0;
|
97 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
|
98 |
+
@!%p1 mov.u32 %r33, %r27;
|
99 |
+
@!%p1 mov.u32 %r34, %r27;
|
100 |
+
@!%p1 mov.u32 %r35, %r27;
|
101 |
+
@!%p1 mov.u32 %r36, %r27;
|
102 |
+
mov.b32 %f17, %r33;
|
103 |
+
mov.b32 %f18, %r34;
|
104 |
+
mov.b32 %f19, %r35;
|
105 |
+
mov.b32 %f20, %r36;
|
106 |
+
.loc 1 39 38
|
107 |
+
fma.rn.f32 %f46, %f16, %f20, %f46;
|
108 |
+
fma.rn.f32 %f45, %f15, %f19, %f45;
|
109 |
+
fma.rn.f32 %f44, %f14, %f18, %f44;
|
110 |
+
fma.rn.f32 %f43, %f13, %f17, %f43;
|
111 |
+
.loc 1 29 36
|
112 |
+
add.s32 %r97, %r97, 8;
|
113 |
+
add.s32 %r96, %r96, 2048;
|
114 |
+
setp.lt.u32 %p9, %r97, 120;
|
115 |
+
@%p9 bra $L__BB0_1;
|
116 |
+
.loc 1 22 44
|
117 |
+
and.b32 %r58, %r1, 63;
|
118 |
+
.loc 1 22 23
|
119 |
+
or.b32 %r59, %r5, %r58;
|
120 |
+
$L__tmp1:
|
121 |
+
.loc 2 243 36
|
122 |
+
mov.b32 %r60, %f43;
|
123 |
+
shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
|
124 |
+
mov.b32 %f21, %r61;
|
125 |
+
$L__tmp2:
|
126 |
+
.loc 2 233 15
|
127 |
+
add.f32 %f22, %f43, %f21;
|
128 |
+
$L__tmp3:
|
129 |
+
.loc 2 243 36
|
130 |
+
mov.b32 %r62, %f44;
|
131 |
+
shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
|
132 |
+
mov.b32 %f23, %r63;
|
133 |
+
$L__tmp4:
|
134 |
+
.loc 2 233 15
|
135 |
+
add.f32 %f24, %f44, %f23;
|
136 |
+
$L__tmp5:
|
137 |
+
.loc 2 243 36
|
138 |
+
mov.b32 %r64, %f45;
|
139 |
+
shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
|
140 |
+
mov.b32 %f25, %r65;
|
141 |
+
$L__tmp6:
|
142 |
+
.loc 2 233 15
|
143 |
+
add.f32 %f26, %f45, %f25;
|
144 |
+
$L__tmp7:
|
145 |
+
.loc 2 243 36
|
146 |
+
mov.b32 %r66, %f46;
|
147 |
+
shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
|
148 |
+
mov.b32 %f27, %r67;
|
149 |
+
$L__tmp8:
|
150 |
+
.loc 2 233 15
|
151 |
+
add.f32 %f28, %f46, %f27;
|
152 |
+
$L__tmp9:
|
153 |
+
.loc 2 243 36
|
154 |
+
setp.lt.u32 %p10, %r2, 16;
|
155 |
+
shl.b32 %r68, %r3, 2;
|
156 |
+
or.b32 %r69, %r68, %r4;
|
157 |
+
shl.b32 %r70, %r69, 2;
|
158 |
+
mov.u32 %r71, global_smem;
|
159 |
+
add.s32 %r41, %r71, %r70;
|
160 |
+
mov.b32 %r42, %f22;
|
161 |
+
@%p10 st.shared.b32 [ %r41 + 0 ], %r42;
|
162 |
+
shl.b32 %r72, %r4, 2;
|
163 |
+
shl.b32 %r73, %r3, 4;
|
164 |
+
or.b32 %r74, %r73, 16;
|
165 |
+
or.b32 %r75, %r74, %r72;
|
166 |
+
add.s32 %r43, %r71, %r75;
|
167 |
+
mov.b32 %r44, %f24;
|
168 |
+
@%p10 st.shared.b32 [ %r43 + 0 ], %r44;
|
169 |
+
or.b32 %r76, %r73, 32;
|
170 |
+
or.b32 %r77, %r76, %r72;
|
171 |
+
add.s32 %r45, %r71, %r77;
|
172 |
+
mov.b32 %r46, %f26;
|
173 |
+
@%p10 st.shared.b32 [ %r45 + 0 ], %r46;
|
174 |
+
or.b32 %r78, %r73, 48;
|
175 |
+
or.b32 %r79, %r78, %r72;
|
176 |
+
add.s32 %r47, %r71, %r79;
|
177 |
+
mov.b32 %r48, %f28;
|
178 |
+
@%p10 st.shared.b32 [ %r47 + 0 ], %r48;
|
179 |
+
bar.sync 0;
|
180 |
+
setp.lt.s32 %p14, %r1, 256;
|
181 |
+
add.s32 %r50, %r71, %r13;
|
182 |
+
@%p14 ld.shared.b32 %r49, [ %r50 + 0 ];
|
183 |
+
mov.b32 %f29, %r49;
|
184 |
+
shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
|
185 |
+
mov.b32 %f30, %r81;
|
186 |
+
$L__tmp10:
|
187 |
+
.loc 2 233 15
|
188 |
+
add.f32 %f31, %f29, %f30;
|
189 |
+
$L__tmp11:
|
190 |
+
.loc 2 243 36
|
191 |
+
mov.b32 %r82, %f31;
|
192 |
+
shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
|
193 |
+
mov.b32 %f32, %r83;
|
194 |
+
$L__tmp12:
|
195 |
+
.loc 2 233 15
|
196 |
+
add.f32 %f33, %f31, %f32;
|
197 |
+
$L__tmp13:
|
198 |
+
.loc 2 243 36
|
199 |
+
and.b32 %r84, %r1, 3;
|
200 |
+
setp.eq.s32 %p19, %r84, 0;
|
201 |
+
and.pred %p15, %p14, %p19;
|
202 |
+
mov.b32 %r52, %f33;
|
203 |
+
@%p15 st.shared.b32 [ %r50 + 0 ], %r52;
|
204 |
+
add.s32 %r54, %r50, 512;
|
205 |
+
@%p14 ld.shared.b32 %r53, [ %r54 + 0 ];
|
206 |
+
mov.b32 %f34, %r53;
|
207 |
+
shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
|
208 |
+
mov.b32 %f35, %r85;
|
209 |
+
$L__tmp14:
|
210 |
+
.loc 2 233 15
|
211 |
+
add.f32 %f36, %f34, %f35;
|
212 |
+
$L__tmp15:
|
213 |
+
.loc 2 243 36
|
214 |
+
mov.b32 %r86, %f36;
|
215 |
+
shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
|
216 |
+
mov.b32 %f37, %r87;
|
217 |
+
$L__tmp16:
|
218 |
+
.loc 2 233 15
|
219 |
+
add.f32 %f38, %f36, %f37;
|
220 |
+
$L__tmp17:
|
221 |
+
.loc 2 243 36
|
222 |
+
mov.b32 %r56, %f38;
|
223 |
+
@%p15 st.shared.b32 [ %r54 + 0 ], %r56;
|
224 |
+
bar.sync 0;
|
225 |
+
add.s32 %r88, %r71, %r73;
|
226 |
+
ld.shared.f32 %f39, [%r88];
|
227 |
+
add.s32 %r89, %r71, %r74;
|
228 |
+
ld.shared.f32 %f40, [%r89];
|
229 |
+
add.s32 %r90, %r71, %r76;
|
230 |
+
ld.shared.f32 %f41, [%r90];
|
231 |
+
add.s32 %r91, %r71, %r78;
|
232 |
+
ld.shared.f32 %f42, [%r91];
|
233 |
+
$L__tmp18:
|
234 |
+
.loc 1 40 28
|
235 |
+
bar.sync 0;
|
236 |
+
add.s32 %r92, %r71, %r68;
|
237 |
+
st.shared.f32 [%r92], %f39;
|
238 |
+
st.shared.f32 [%r92+4], %f40;
|
239 |
+
st.shared.f32 [%r92+8], %f41;
|
240 |
+
st.shared.f32 [%r92+12], %f42;
|
241 |
+
bar.sync 0;
|
242 |
+
shl.b32 %r93, %r58, 2;
|
243 |
+
add.s32 %r94, %r71, %r93;
|
244 |
+
ld.shared.u32 %r57, [%r94];
|
245 |
+
.loc 1 41 25
|
246 |
+
mul.wide.s32 %rd9, %r59, 4;
|
247 |
+
add.s64 %rd8, %rd3, %rd9;
|
248 |
+
.loc 1 41 36
|
249 |
+
and.b32 %r95, %r1, 64;
|
250 |
+
setp.eq.s32 %p18, %r95, 0;
|
251 |
+
@%p18 st.global.b32 [ %rd8 + 0 ], { %r57 };
|
252 |
+
.loc 1 41 4
|
253 |
+
ret;
|
254 |
+
$L__tmp19:
|
255 |
+
$L__func_end0:
|
256 |
+
|
257 |
+
}
|
258 |
+
.file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py"
|
259 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
260 |
+
.section .debug_abbrev
|
261 |
+
{
|
262 |
+
.b8 1
|
263 |
+
.b8 17
|
264 |
+
.b8 1
|
265 |
+
.b8 37
|
266 |
+
.b8 8
|
267 |
+
.b8 19
|
268 |
+
.b8 5
|
269 |
+
.b8 3
|
270 |
+
.b8 8
|
271 |
+
.b8 16
|
272 |
+
.b8 6
|
273 |
+
.b8 27
|
274 |
+
.b8 8
|
275 |
+
.b8 180
|
276 |
+
.b8 66
|
277 |
+
.b8 12
|
278 |
+
.b8 17
|
279 |
+
.b8 1
|
280 |
+
.b8 18
|
281 |
+
.b8 1
|
282 |
+
.b8 0
|
283 |
+
.b8 0
|
284 |
+
.b8 2
|
285 |
+
.b8 46
|
286 |
+
.b8 0
|
287 |
+
.b8 135
|
288 |
+
.b8 64
|
289 |
+
.b8 8
|
290 |
+
.b8 3
|
291 |
+
.b8 8
|
292 |
+
.b8 58
|
293 |
+
.b8 11
|
294 |
+
.b8 59
|
295 |
+
.b8 11
|
296 |
+
.b8 63
|
297 |
+
.b8 12
|
298 |
+
.b8 32
|
299 |
+
.b8 11
|
300 |
+
.b8 0
|
301 |
+
.b8 0
|
302 |
+
.b8 3
|
303 |
+
.b8 46
|
304 |
+
.b8 1
|
305 |
+
.b8 17
|
306 |
+
.b8 1
|
307 |
+
.b8 18
|
308 |
+
.b8 1
|
309 |
+
.b8 64
|
310 |
+
.b8 10
|
311 |
+
.b8 49
|
312 |
+
.b8 19
|
313 |
+
.b8 0
|
314 |
+
.b8 0
|
315 |
+
.b8 4
|
316 |
+
.b8 29
|
317 |
+
.b8 0
|
318 |
+
.b8 49
|
319 |
+
.b8 19
|
320 |
+
.b8 17
|
321 |
+
.b8 1
|
322 |
+
.b8 18
|
323 |
+
.b8 1
|
324 |
+
.b8 88
|
325 |
+
.b8 11
|
326 |
+
.b8 89
|
327 |
+
.b8 11
|
328 |
+
.b8 87
|
329 |
+
.b8 11
|
330 |
+
.b8 0
|
331 |
+
.b8 0
|
332 |
+
.b8 5
|
333 |
+
.b8 29
|
334 |
+
.b8 1
|
335 |
+
.b8 49
|
336 |
+
.b8 19
|
337 |
+
.b8 17
|
338 |
+
.b8 1
|
339 |
+
.b8 18
|
340 |
+
.b8 1
|
341 |
+
.b8 88
|
342 |
+
.b8 11
|
343 |
+
.b8 89
|
344 |
+
.b8 11
|
345 |
+
.b8 87
|
346 |
+
.b8 11
|
347 |
+
.b8 0
|
348 |
+
.b8 0
|
349 |
+
.b8 0
|
350 |
+
}
|
351 |
+
.section .debug_info
|
352 |
+
{
|
353 |
+
.b32 266
|
354 |
+
.b8 2
|
355 |
+
.b8 0
|
356 |
+
.b32 .debug_abbrev
|
357 |
+
.b8 8
|
358 |
+
.b8 1
|
359 |
+
.b8 116
|
360 |
+
.b8 114
|
361 |
+
.b8 105
|
362 |
+
.b8 116
|
363 |
+
.b8 111
|
364 |
+
.b8 110
|
365 |
+
.b8 0
|
366 |
+
.b8 2
|
367 |
+
.b8 0
|
368 |
+
.b8 99
|
369 |
+
.b8 115
|
370 |
+
.b8 106
|
371 |
+
.b8 100
|
372 |
+
.b8 55
|
373 |
+
.b8 109
|
374 |
+
.b8 108
|
375 |
+
.b8 114
|
376 |
+
.b8 106
|
377 |
+
.b8 117
|
378 |
+
.b8 106
|
379 |
+
.b8 100
|
380 |
+
.b8 52
|
381 |
+
.b8 117
|
382 |
+
.b8 119
|
383 |
+
.b8 122
|
384 |
+
.b8 101
|
385 |
+
.b8 53
|
386 |
+
.b8 116
|
387 |
+
.b8 107
|
388 |
+
.b8 103
|
389 |
+
.b8 55
|
390 |
+
.b8 112
|
391 |
+
.b8 116
|
392 |
+
.b8 116
|
393 |
+
.b8 101
|
394 |
+
.b8 97
|
395 |
+
.b8 103
|
396 |
+
.b8 112
|
397 |
+
.b8 105
|
398 |
+
.b8 104
|
399 |
+
.b8 103
|
400 |
+
.b8 116
|
401 |
+
.b8 53
|
402 |
+
.b8 122
|
403 |
+
.b8 116
|
404 |
+
.b8 97
|
405 |
+
.b8 116
|
406 |
+
.b8 102
|
407 |
+
.b8 113
|
408 |
+
.b8 99
|
409 |
+
.b8 104
|
410 |
+
.b8 112
|
411 |
+
.b8 114
|
412 |
+
.b8 99
|
413 |
+
.b8 114
|
414 |
+
.b8 97
|
415 |
+
.b8 120
|
416 |
+
.b8 50
|
417 |
+
.b8 50
|
418 |
+
.b8 108
|
419 |
+
.b8 115
|
420 |
+
.b8 46
|
421 |
+
.b8 112
|
422 |
+
.b8 121
|
423 |
+
.b8 0
|
424 |
+
.b32 .debug_line
|
425 |
+
.b8 47
|
426 |
+
.b8 116
|
427 |
+
.b8 109
|
428 |
+
.b8 112
|
429 |
+
.b8 47
|
430 |
+
.b8 116
|
431 |
+
.b8 111
|
432 |
+
.b8 114
|
433 |
+
.b8 99
|
434 |
+
.b8 104
|
435 |
+
.b8 105
|
436 |
+
.b8 110
|
437 |
+
.b8 100
|
438 |
+
.b8 117
|
439 |
+
.b8 99
|
440 |
+
.b8 116
|
441 |
+
.b8 111
|
442 |
+
.b8 114
|
443 |
+
.b8 95
|
444 |
+
.b8 114
|
445 |
+
.b8 111
|
446 |
+
.b8 111
|
447 |
+
.b8 116
|
448 |
+
.b8 47
|
449 |
+
.b8 115
|
450 |
+
.b8 106
|
451 |
+
.b8 0
|
452 |
+
.b8 1
|
453 |
+
.b64 $L__func_begin0
|
454 |
+
.b64 $L__func_end0
|
455 |
+
.b8 2
|
456 |
+
.b8 116
|
457 |
+
.b8 114
|
458 |
+
.b8 105
|
459 |
+
.b8 116
|
460 |
+
.b8 111
|
461 |
+
.b8 110
|
462 |
+
.b8 95
|
463 |
+
.b8 95
|
464 |
+
.b8 48
|
465 |
+
.b8 100
|
466 |
+
.b8 49
|
467 |
+
.b8 100
|
468 |
+
.b8 50
|
469 |
+
.b8 100
|
470 |
+
.b8 51
|
471 |
+
.b8 100
|
472 |
+
.b8 101
|
473 |
+
.b8 52
|
474 |
+
.b8 100
|
475 |
+
.b8 101
|
476 |
+
.b8 0
|
477 |
+
.b8 116
|
478 |
+
.b8 114
|
479 |
+
.b8 105
|
480 |
+
.b8 116
|
481 |
+
.b8 111
|
482 |
+
.b8 110
|
483 |
+
.b8 95
|
484 |
+
.b8 95
|
485 |
+
.b8 48
|
486 |
+
.b8 100
|
487 |
+
.b8 49
|
488 |
+
.b8 100
|
489 |
+
.b8 50
|
490 |
+
.b8 100
|
491 |
+
.b8 51
|
492 |
+
.b8 100
|
493 |
+
.b8 101
|
494 |
+
.b8 52
|
495 |
+
.b8 100
|
496 |
+
.b8 101
|
497 |
+
.b8 0
|
498 |
+
.b8 1
|
499 |
+
.b8 18
|
500 |
+
.b8 1
|
501 |
+
.b8 1
|
502 |
+
.b8 3
|
503 |
+
.b64 $L__func_begin0
|
504 |
+
.b64 $L__func_end0
|
505 |
+
.b8 1
|
506 |
+
.b8 156
|
507 |
+
.b32 125
|
508 |
+
.b8 4
|
509 |
+
.b32 125
|
510 |
+
.b64 $L__tmp1
|
511 |
+
.b64 $L__tmp18
|
512 |
+
.b8 2
|
513 |
+
.b8 40
|
514 |
+
.b8 25
|
515 |
+
.b8 5
|
516 |
+
.b32 125
|
517 |
+
.b64 $L__tmp2
|
518 |
+
.b64 $L__tmp17
|
519 |
+
.b8 2
|
520 |
+
.b8 40
|
521 |
+
.b8 25
|
522 |
+
.b8 4
|
523 |
+
.b32 125
|
524 |
+
.b64 $L__tmp2
|
525 |
+
.b64 $L__tmp17
|
526 |
+
.b8 2
|
527 |
+
.b8 243
|
528 |
+
.b8 36
|
529 |
+
.b8 0
|
530 |
+
.b8 0
|
531 |
+
.b8 0
|
532 |
+
}
|
533 |
+
.section .debug_pubnames
|
534 |
+
{
|
535 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
536 |
+
$L__pubNames_start0:
|
537 |
+
.b8 2
|
538 |
+
.b8 0
|
539 |
+
.b32 .debug_info
|
540 |
+
.b32 270
|
541 |
+
.b32 125
|
542 |
+
.b8 116
|
543 |
+
.b8 114
|
544 |
+
.b8 105
|
545 |
+
.b8 116
|
546 |
+
.b8 111
|
547 |
+
.b8 110
|
548 |
+
.b8 95
|
549 |
+
.b8 95
|
550 |
+
.b8 48
|
551 |
+
.b8 100
|
552 |
+
.b8 49
|
553 |
+
.b8 100
|
554 |
+
.b8 50
|
555 |
+
.b8 100
|
556 |
+
.b8 51
|
557 |
+
.b8 100
|
558 |
+
.b8 101
|
559 |
+
.b8 52
|
560 |
+
.b8 100
|
561 |
+
.b8 101
|
562 |
+
.b8 0
|
563 |
+
.b32 0
|
564 |
+
$L__pubNames_end0:
|
565 |
+
}
|
566 |
+
.section .debug_pubtypes
|
567 |
+
{
|
568 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
569 |
+
$L__pubTypes_start0:
|
570 |
+
.b8 2
|
571 |
+
.b8 0
|
572 |
+
.b32 .debug_info
|
573 |
+
.b32 270
|
574 |
+
.b32 0
|
575 |
+
$L__pubTypes_end0:
|
576 |
+
}
|
577 |
+
.section .debug_loc { }
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
|
9 |
+
%c0_i32 = arith.constant 0 : i32
|
10 |
+
%c128_i32 = arith.constant 128 : i32
|
11 |
+
%c8_i32 = arith.constant 8 : i32
|
12 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
13 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
|
14 |
+
%c64_i32 = arith.constant 64 : i32
|
15 |
+
%0 = tt.get_program_id x : i32
|
16 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
17 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
19 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
20 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
21 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
22 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
23 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
24 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
25 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
26 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
27 |
+
%12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
28 |
+
%13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
|
29 |
+
%14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
30 |
+
%15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
|
31 |
+
%16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
32 |
+
%17 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
33 |
+
%18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
34 |
+
%19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
|
35 |
+
%25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
|
36 |
+
%26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
|
37 |
+
%27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
|
38 |
+
%28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
|
39 |
+
%29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
40 |
+
%30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
|
41 |
+
%31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
|
42 |
+
%32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
43 |
+
%33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
44 |
+
%34 = tt.load %32, %33, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
45 |
+
%35 = arith.extf %34 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
|
46 |
+
%36 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
47 |
+
%37 = tt.load %36, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
48 |
+
%38 = arith.mulf %35, %37 : tensor<64x8xf32, #blocked>
|
49 |
+
%39 = arith.addf %arg6, %38 : tensor<64x8xf32, #blocked>
|
50 |
+
%40 = arith.select %33, %39, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
51 |
+
scf.yield %40 : tensor<64x8xf32, #blocked>
|
52 |
+
}
|
53 |
+
%20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
|
54 |
+
^bb0(%arg5: f32, %arg6: f32):
|
55 |
+
%25 = arith.addf %arg5, %arg6 : f32
|
56 |
+
tt.reduce.return %25 : f32
|
57 |
+
}) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
58 |
+
%21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
59 |
+
%22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
|
60 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
|
61 |
+
%24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
62 |
+
tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
|
63 |
+
tt.return
|
64 |
+
}
|
65 |
+
}
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin
ADDED
Binary file (4.52 kB). View file
|
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx
ADDED
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7,
|
21 |
+
.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8
|
22 |
+
)
|
23 |
+
.maxntid 64, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<37>;
|
26 |
+
.reg .b16 %rs<9>;
|
27 |
+
.reg .b32 %r<110>;
|
28 |
+
.reg .f32 %f<86>;
|
29 |
+
.reg .b64 %rd<26>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0];
|
35 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1];
|
36 |
+
$L__tmp0:
|
37 |
+
.loc 1 26 26
|
38 |
+
mov.u32 %r76, %tid.x;
|
39 |
+
and.b32 %r77, %r76, 31;
|
40 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2];
|
41 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3];
|
42 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4];
|
43 |
+
shl.b32 %r78, %r76, 2;
|
44 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5];
|
45 |
+
and.b32 %r79, %r78, 252;
|
46 |
+
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6];
|
47 |
+
.loc 1 23 28
|
48 |
+
mov.u32 %r1, %ctaid.x;
|
49 |
+
.loc 1 30 40
|
50 |
+
shl.b32 %r80, %r1, 8;
|
51 |
+
.loc 1 30 36
|
52 |
+
or.b32 %r81, %r80, %r79;
|
53 |
+
.loc 1 30 30
|
54 |
+
mul.wide.s32 %rd22, %r81, 2;
|
55 |
+
add.s64 %rd1, %rd16, %rd22;
|
56 |
+
mov.b32 %r4, 0;
|
57 |
+
mov.pred %p1, -1;
|
58 |
+
.loc 1 30 46
|
59 |
+
mov.u32 %r2, 0x0;
|
60 |
+
mov.u32 %r3, 0x0;
|
61 |
+
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
|
62 |
+
@!%p1 mov.u32 %r2, %r4;
|
63 |
+
@!%p1 mov.u32 %r3, %r4;
|
64 |
+
cvt.u16.u32 %rs1, %r2;
|
65 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
66 |
+
cvt.u16.u32 %rs3, %r3;
|
67 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
68 |
+
.loc 1 30 67
|
69 |
+
cvt.f32.bf16 %r6, %rs1;
|
70 |
+
mov.b32 %f1, %r6;
|
71 |
+
cvt.f32.bf16 %r7, %rs2;
|
72 |
+
mov.b32 %f2, %r7;
|
73 |
+
cvt.f32.bf16 %r8, %rs3;
|
74 |
+
mov.b32 %f3, %r8;
|
75 |
+
cvt.f32.bf16 %r9, %rs4;
|
76 |
+
mov.b32 %f4, %r9;
|
77 |
+
.loc 1 31 30
|
78 |
+
mul.wide.u32 %rd23, %r79, 4;
|
79 |
+
add.s64 %rd2, %rd17, %rd23;
|
80 |
+
.loc 1 31 35
|
81 |
+
mov.u32 %r10, 0x0;
|
82 |
+
mov.u32 %r11, 0x0;
|
83 |
+
mov.u32 %r12, 0x0;
|
84 |
+
mov.u32 %r13, 0x0;
|
85 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
86 |
+
@!%p1 mov.u32 %r10, %r4;
|
87 |
+
@!%p1 mov.u32 %r11, %r4;
|
88 |
+
@!%p1 mov.u32 %r12, %r4;
|
89 |
+
@!%p1 mov.u32 %r13, %r4;
|
90 |
+
mov.b32 %f5, %r10;
|
91 |
+
mov.b32 %f6, %r11;
|
92 |
+
mov.b32 %f7, %r12;
|
93 |
+
mov.b32 %f8, %r13;
|
94 |
+
.loc 1 32 30
|
95 |
+
mul.wide.s32 %rd24, %r81, 4;
|
96 |
+
add.s64 %rd3, %rd18, %rd24;
|
97 |
+
.loc 1 32 46
|
98 |
+
mov.u32 %r18, 0x0;
|
99 |
+
mov.u32 %r19, 0x0;
|
100 |
+
mov.u32 %r20, 0x0;
|
101 |
+
mov.u32 %r21, 0x0;
|
102 |
+
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
103 |
+
@!%p1 mov.u32 %r18, %r4;
|
104 |
+
@!%p1 mov.u32 %r19, %r4;
|
105 |
+
@!%p1 mov.u32 %r20, %r4;
|
106 |
+
@!%p1 mov.u32 %r21, %r4;
|
107 |
+
mov.b32 %f9, %r18;
|
108 |
+
mov.b32 %f10, %r19;
|
109 |
+
mov.b32 %f11, %r20;
|
110 |
+
mov.b32 %f12, %r21;
|
111 |
+
.loc 1 33 30
|
112 |
+
mul.wide.s32 %rd25, %r1, 4;
|
113 |
+
add.s64 %rd4, %rd19, %rd25;
|
114 |
+
.loc 1 33 35
|
115 |
+
mov.u32 %r26, 0x0;
|
116 |
+
@%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
|
117 |
+
mov.b32 %f13, %r26;
|
118 |
+
mov.u32 %r27, 0x0;
|
119 |
+
@%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
|
120 |
+
mov.u32 %r28, 0x0;
|
121 |
+
@%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
|
122 |
+
mov.u32 %r29, 0x0;
|
123 |
+
@%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
|
124 |
+
.loc 1 34 31
|
125 |
+
add.s64 %rd8, %rd20, %rd25;
|
126 |
+
.loc 1 34 36
|
127 |
+
mov.u32 %r55, 0x0;
|
128 |
+
@%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
|
129 |
+
mov.b32 %f14, %r55;
|
130 |
+
mov.u32 %r31, 0x0;
|
131 |
+
@%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
|
132 |
+
mov.u32 %r32, 0x0;
|
133 |
+
@%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
|
134 |
+
mov.u32 %r33, 0x0;
|
135 |
+
@%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
|
136 |
+
.loc 1 35 35
|
137 |
+
add.s64 %rd12, %rd15, %rd24;
|
138 |
+
.loc 1 35 51
|
139 |
+
mov.u32 %r34, 0x0;
|
140 |
+
mov.u32 %r35, 0x0;
|
141 |
+
mov.u32 %r36, 0x0;
|
142 |
+
mov.u32 %r37, 0x0;
|
143 |
+
@%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ];
|
144 |
+
@!%p1 mov.u32 %r34, %r4;
|
145 |
+
@!%p1 mov.u32 %r35, %r4;
|
146 |
+
@!%p1 mov.u32 %r36, %r4;
|
147 |
+
@!%p1 mov.u32 %r37, %r4;
|
148 |
+
mov.b32 %f15, %r34;
|
149 |
+
mov.b32 %f16, %r35;
|
150 |
+
mov.b32 %f17, %r36;
|
151 |
+
mov.b32 %f18, %r37;
|
152 |
+
.loc 1 37 18
|
153 |
+
mul.f32 %f19, %f1, %f5;
|
154 |
+
mul.f32 %f20, %f2, %f6;
|
155 |
+
mul.f32 %f21, %f3, %f7;
|
156 |
+
mul.f32 %f22, %f4, %f8;
|
157 |
+
$L__tmp1:
|
158 |
+
.loc 2 233 15
|
159 |
+
fma.rn.f32 %f23, %f1, %f5, %f20;
|
160 |
+
fma.rn.f32 %f24, %f3, %f7, %f23;
|
161 |
+
fma.rn.f32 %f25, %f4, %f8, %f24;
|
162 |
+
$L__tmp2:
|
163 |
+
.loc 2 243 36
|
164 |
+
mov.b32 %r82, %f25;
|
165 |
+
shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1;
|
166 |
+
mov.b32 %f26, %r83;
|
167 |
+
$L__tmp3:
|
168 |
+
.loc 2 233 15
|
169 |
+
add.f32 %f27, %f25, %f26;
|
170 |
+
$L__tmp4:
|
171 |
+
.loc 2 243 36
|
172 |
+
mov.b32 %r84, %f27;
|
173 |
+
shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1;
|
174 |
+
mov.b32 %f28, %r85;
|
175 |
+
$L__tmp5:
|
176 |
+
.loc 2 233 15
|
177 |
+
add.f32 %f29, %f27, %f28;
|
178 |
+
$L__tmp6:
|
179 |
+
.loc 2 243 36
|
180 |
+
mov.b32 %r86, %f29;
|
181 |
+
shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1;
|
182 |
+
mov.b32 %f30, %r87;
|
183 |
+
$L__tmp7:
|
184 |
+
.loc 2 233 15
|
185 |
+
add.f32 %f31, %f29, %f30;
|
186 |
+
$L__tmp8:
|
187 |
+
.loc 2 243 36
|
188 |
+
mov.b32 %r88, %f31;
|
189 |
+
shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1;
|
190 |
+
mov.b32 %f32, %r89;
|
191 |
+
$L__tmp9:
|
192 |
+
.loc 2 233 15
|
193 |
+
add.f32 %f33, %f31, %f32;
|
194 |
+
$L__tmp10:
|
195 |
+
.loc 2 243 36
|
196 |
+
mov.b32 %r90, %f33;
|
197 |
+
shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1;
|
198 |
+
mov.b32 %f34, %r91;
|
199 |
+
$L__tmp11:
|
200 |
+
.loc 2 233 15
|
201 |
+
add.f32 %f35, %f33, %f34;
|
202 |
+
$L__tmp12:
|
203 |
+
.loc 2 243 36
|
204 |
+
setp.eq.s32 %p27, %r77, 0;
|
205 |
+
shr.u32 %r92, %r76, 3;
|
206 |
+
and.b32 %r93, %r92, 4;
|
207 |
+
mov.u32 %r94, global_smem;
|
208 |
+
add.s32 %r42, %r94, %r93;
|
209 |
+
mov.b32 %r43, %f35;
|
210 |
+
@%p27 st.shared.b32 [ %r42 + 0 ], %r43;
|
211 |
+
bar.sync 0;
|
212 |
+
setp.lt.s32 %p28, %r76, 2;
|
213 |
+
add.s32 %r45, %r94, %r78;
|
214 |
+
@%p28 ld.shared.b32 %r44, [ %r45 + 0 ];
|
215 |
+
mov.b32 %f36, %r44;
|
216 |
+
shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1;
|
217 |
+
mov.b32 %f37, %r95;
|
218 |
+
$L__tmp13:
|
219 |
+
.loc 2 233 15
|
220 |
+
add.f32 %f38, %f36, %f37;
|
221 |
+
$L__tmp14:
|
222 |
+
.loc 2 243 36
|
223 |
+
and.b32 %r96, %r76, 1;
|
224 |
+
setp.eq.b32 %p35, %r96, 1;
|
225 |
+
not.pred %p36, %p35;
|
226 |
+
and.pred %p29, %p28, %p36;
|
227 |
+
mov.b32 %r47, %f38;
|
228 |
+
@%p29 st.shared.b32 [ %r45 + 0 ], %r47;
|
229 |
+
bar.sync 0;
|
230 |
+
ld.shared.f32 %f39, [global_smem];
|
231 |
+
$L__tmp15:
|
232 |
+
.loc 3 8 15
|
233 |
+
add.f32 %f40, %f39, 0f00000000;
|
234 |
+
$L__tmp16:
|
235 |
+
.loc 1 41 19
|
236 |
+
sub.f32 %f41, %f9, %f13;
|
237 |
+
sub.f32 %f42, %f10, %f13;
|
238 |
+
sub.f32 %f43, %f11, %f13;
|
239 |
+
sub.f32 %f44, %f12, %f13;
|
240 |
+
.loc 1 42 20
|
241 |
+
mul.f32 %f45, %f41, %f14;
|
242 |
+
mul.f32 %f46, %f42, %f14;
|
243 |
+
mul.f32 %f47, %f43, %f14;
|
244 |
+
mul.f32 %f48, %f44, %f14;
|
245 |
+
.loc 1 43 19
|
246 |
+
mul.f32 %f49, %f20, %f46;
|
247 |
+
$L__tmp17:
|
248 |
+
.loc 2 243 36
|
249 |
+
bar.sync 0;
|
250 |
+
$L__tmp18:
|
251 |
+
.loc 2 233 15
|
252 |
+
fma.rn.f32 %f50, %f19, %f45, %f49;
|
253 |
+
fma.rn.f32 %f51, %f21, %f47, %f50;
|
254 |
+
fma.rn.f32 %f52, %f22, %f48, %f51;
|
255 |
+
$L__tmp19:
|
256 |
+
.loc 2 243 36
|
257 |
+
mov.b32 %r97, %f52;
|
258 |
+
shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
|
259 |
+
mov.b32 %f53, %r98;
|
260 |
+
$L__tmp20:
|
261 |
+
.loc 2 233 15
|
262 |
+
add.f32 %f54, %f52, %f53;
|
263 |
+
$L__tmp21:
|
264 |
+
.loc 2 243 36
|
265 |
+
mov.b32 %r99, %f54;
|
266 |
+
shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
|
267 |
+
mov.b32 %f55, %r100;
|
268 |
+
$L__tmp22:
|
269 |
+
.loc 2 233 15
|
270 |
+
add.f32 %f56, %f54, %f55;
|
271 |
+
$L__tmp23:
|
272 |
+
.loc 2 243 36
|
273 |
+
mov.b32 %r101, %f56;
|
274 |
+
shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
|
275 |
+
mov.b32 %f57, %r102;
|
276 |
+
$L__tmp24:
|
277 |
+
.loc 2 233 15
|
278 |
+
add.f32 %f58, %f56, %f57;
|
279 |
+
$L__tmp25:
|
280 |
+
.loc 2 243 36
|
281 |
+
mov.b32 %r103, %f58;
|
282 |
+
shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
|
283 |
+
mov.b32 %f59, %r104;
|
284 |
+
$L__tmp26:
|
285 |
+
.loc 2 233 15
|
286 |
+
add.f32 %f60, %f58, %f59;
|
287 |
+
$L__tmp27:
|
288 |
+
.loc 2 243 36
|
289 |
+
mov.b32 %r105, %f60;
|
290 |
+
shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
|
291 |
+
mov.b32 %f61, %r106;
|
292 |
+
$L__tmp28:
|
293 |
+
.loc 2 233 15
|
294 |
+
add.f32 %f62, %f60, %f61;
|
295 |
+
$L__tmp29:
|
296 |
+
.loc 2 243 36
|
297 |
+
mov.b32 %r49, %f62;
|
298 |
+
@%p27 st.shared.b32 [ %r42 + 0 ], %r49;
|
299 |
+
bar.sync 0;
|
300 |
+
@%p28 ld.shared.b32 %r50, [ %r45 + 0 ];
|
301 |
+
mov.b32 %f63, %r50;
|
302 |
+
shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1;
|
303 |
+
mov.b32 %f64, %r107;
|
304 |
+
$L__tmp30:
|
305 |
+
.loc 2 233 15
|
306 |
+
add.f32 %f65, %f63, %f64;
|
307 |
+
$L__tmp31:
|
308 |
+
.loc 2 243 36
|
309 |
+
mov.b32 %r53, %f65;
|
310 |
+
@%p29 st.shared.b32 [ %r45 + 0 ], %r53;
|
311 |
+
bar.sync 0;
|
312 |
+
ld.shared.f32 %f66, [global_smem];
|
313 |
+
$L__tmp32:
|
314 |
+
.loc 3 8 15
|
315 |
+
add.f32 %f67, %f66, 0f00000000;
|
316 |
+
mov.b32 %r56, 1132462080;
|
317 |
+
$L__tmp33:
|
318 |
+
.loc 1 48 20
|
319 |
+
div.full.f32 %r54, %r55, %r56;
|
320 |
+
mov.b32 %f68, %r54;
|
321 |
+
.loc 1 50 20
|
322 |
+
neg.f32 %f69, %f40;
|
323 |
+
fma.rn.f32 %f70, %f19, 0f43800000, %f69;
|
324 |
+
fma.rn.f32 %f71, %f20, 0f43800000, %f69;
|
325 |
+
fma.rn.f32 %f72, %f21, 0f43800000, %f69;
|
326 |
+
fma.rn.f32 %f73, %f22, 0f43800000, %f69;
|
327 |
+
.loc 1 52 20
|
328 |
+
neg.f32 %f74, %f45;
|
329 |
+
fma.rn.f32 %f75, %f74, %f67, %f70;
|
330 |
+
neg.f32 %f76, %f46;
|
331 |
+
fma.rn.f32 %f77, %f76, %f67, %f71;
|
332 |
+
neg.f32 %f78, %f47;
|
333 |
+
fma.rn.f32 %f79, %f78, %f67, %f72;
|
334 |
+
neg.f32 %f80, %f48;
|
335 |
+
fma.rn.f32 %f81, %f80, %f67, %f73;
|
336 |
+
.loc 1 54 20
|
337 |
+
fma.rn.f32 %f82, %f68, %f75, %f15;
|
338 |
+
fma.rn.f32 %f83, %f68, %f77, %f16;
|
339 |
+
fma.rn.f32 %f84, %f68, %f79, %f17;
|
340 |
+
fma.rn.f32 %f85, %f68, %f81, %f18;
|
341 |
+
.loc 1 56 51
|
342 |
+
mov.b32 %r66, %f82;
|
343 |
+
mov.b32 %r67, %f83;
|
344 |
+
mov.b32 %r68, %f84;
|
345 |
+
mov.b32 %r69, %f85;
|
346 |
+
@%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 };
|
347 |
+
.loc 1 57 25
|
348 |
+
add.s64 %rd14, %rd21, %rd22;
|
349 |
+
.loc 1 57 48
|
350 |
+
cvt.rn.bf16.f32 %rs5, %r66;
|
351 |
+
cvt.rn.bf16.f32 %rs6, %r67;
|
352 |
+
cvt.rn.bf16.f32 %rs7, %r68;
|
353 |
+
cvt.rn.bf16.f32 %rs8, %r69;
|
354 |
+
mov.b32 %r108, {%rs5, %rs6};
|
355 |
+
mov.b32 %r109, {%rs7, %rs8};
|
356 |
+
@%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 };
|
357 |
+
.loc 1 57 4
|
358 |
+
ret;
|
359 |
+
$L__tmp34:
|
360 |
+
$L__func_end0:
|
361 |
+
|
362 |
+
}
|
363 |
+
.file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py"
|
364 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
365 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
366 |
+
.section .debug_abbrev
|
367 |
+
{
|
368 |
+
.b8 1
|
369 |
+
.b8 17
|
370 |
+
.b8 1
|
371 |
+
.b8 37
|
372 |
+
.b8 8
|
373 |
+
.b8 19
|
374 |
+
.b8 5
|
375 |
+
.b8 3
|
376 |
+
.b8 8
|
377 |
+
.b8 16
|
378 |
+
.b8 6
|
379 |
+
.b8 27
|
380 |
+
.b8 8
|
381 |
+
.b8 180
|
382 |
+
.b8 66
|
383 |
+
.b8 12
|
384 |
+
.b8 17
|
385 |
+
.b8 1
|
386 |
+
.b8 18
|
387 |
+
.b8 1
|
388 |
+
.b8 0
|
389 |
+
.b8 0
|
390 |
+
.b8 2
|
391 |
+
.b8 46
|
392 |
+
.b8 0
|
393 |
+
.b8 135
|
394 |
+
.b8 64
|
395 |
+
.b8 8
|
396 |
+
.b8 3
|
397 |
+
.b8 8
|
398 |
+
.b8 58
|
399 |
+
.b8 11
|
400 |
+
.b8 59
|
401 |
+
.b8 11
|
402 |
+
.b8 63
|
403 |
+
.b8 12
|
404 |
+
.b8 32
|
405 |
+
.b8 11
|
406 |
+
.b8 0
|
407 |
+
.b8 0
|
408 |
+
.b8 3
|
409 |
+
.b8 46
|
410 |
+
.b8 1
|
411 |
+
.b8 17
|
412 |
+
.b8 1
|
413 |
+
.b8 18
|
414 |
+
.b8 1
|
415 |
+
.b8 64
|
416 |
+
.b8 10
|
417 |
+
.b8 49
|
418 |
+
.b8 19
|
419 |
+
.b8 0
|
420 |
+
.b8 0
|
421 |
+
.b8 4
|
422 |
+
.b8 29
|
423 |
+
.b8 1
|
424 |
+
.b8 49
|
425 |
+
.b8 19
|
426 |
+
.b8 17
|
427 |
+
.b8 1
|
428 |
+
.b8 18
|
429 |
+
.b8 1
|
430 |
+
.b8 88
|
431 |
+
.b8 11
|
432 |
+
.b8 89
|
433 |
+
.b8 11
|
434 |
+
.b8 87
|
435 |
+
.b8 11
|
436 |
+
.b8 0
|
437 |
+
.b8 0
|
438 |
+
.b8 5
|
439 |
+
.b8 29
|
440 |
+
.b8 0
|
441 |
+
.b8 49
|
442 |
+
.b8 19
|
443 |
+
.b8 17
|
444 |
+
.b8 1
|
445 |
+
.b8 18
|
446 |
+
.b8 1
|
447 |
+
.b8 88
|
448 |
+
.b8 11
|
449 |
+
.b8 89
|
450 |
+
.b8 11
|
451 |
+
.b8 87
|
452 |
+
.b8 11
|
453 |
+
.b8 0
|
454 |
+
.b8 0
|
455 |
+
.b8 0
|
456 |
+
}
|
457 |
+
.section .debug_info
|
458 |
+
{
|
459 |
+
.b32 403
|
460 |
+
.b8 2
|
461 |
+
.b8 0
|
462 |
+
.b32 .debug_abbrev
|
463 |
+
.b8 8
|
464 |
+
.b8 1
|
465 |
+
.b8 116
|
466 |
+
.b8 114
|
467 |
+
.b8 105
|
468 |
+
.b8 116
|
469 |
+
.b8 111
|
470 |
+
.b8 110
|
471 |
+
.b8 0
|
472 |
+
.b8 2
|
473 |
+
.b8 0
|
474 |
+
.b8 99
|
475 |
+
.b8 115
|
476 |
+
.b8 110
|
477 |
+
.b8 101
|
478 |
+
.b8 100
|
479 |
+
.b8 52
|
480 |
+
.b8 104
|
481 |
+
.b8 121
|
482 |
+
.b8 120
|
483 |
+
.b8 112
|
484 |
+
.b8 103
|
485 |
+
.b8 119
|
486 |
+
.b8 117
|
487 |
+
.b8 53
|
488 |
+
.b8 116
|
489 |
+
.b8 116
|
490 |
+
.b8 117
|
491 |
+
.b8 98
|
492 |
+
.b8 115
|
493 |
+
.b8 51
|
494 |
+
.b8 114
|
495 |
+
.b8 55
|
496 |
+
.b8 117
|
497 |
+
.b8 120
|
498 |
+
.b8 107
|
499 |
+
.b8 106
|
500 |
+
.b8 113
|
501 |
+
.b8 53
|
502 |
+
.b8 121
|
503 |
+
.b8 102
|
504 |
+
.b8 108
|
505 |
+
.b8 51
|
506 |
+
.b8 122
|
507 |
+
.b8 104
|
508 |
+
.b8 54
|
509 |
+
.b8 99
|
510 |
+
.b8 50
|
511 |
+
.b8 115
|
512 |
+
.b8 111
|
513 |
+
.b8 122
|
514 |
+
.b8 111
|
515 |
+
.b8 98
|
516 |
+
.b8 116
|
517 |
+
.b8 107
|
518 |
+
.b8 101
|
519 |
+
.b8 107
|
520 |
+
.b8 50
|
521 |
+
.b8 117
|
522 |
+
.b8 122
|
523 |
+
.b8 102
|
524 |
+
.b8 99
|
525 |
+
.b8 118
|
526 |
+
.b8 46
|
527 |
+
.b8 112
|
528 |
+
.b8 121
|
529 |
+
.b8 0
|
530 |
+
.b32 .debug_line
|
531 |
+
.b8 47
|
532 |
+
.b8 116
|
533 |
+
.b8 109
|
534 |
+
.b8 112
|
535 |
+
.b8 47
|
536 |
+
.b8 116
|
537 |
+
.b8 111
|
538 |
+
.b8 114
|
539 |
+
.b8 99
|
540 |
+
.b8 104
|
541 |
+
.b8 105
|
542 |
+
.b8 110
|
543 |
+
.b8 100
|
544 |
+
.b8 117
|
545 |
+
.b8 99
|
546 |
+
.b8 116
|
547 |
+
.b8 111
|
548 |
+
.b8 114
|
549 |
+
.b8 95
|
550 |
+
.b8 114
|
551 |
+
.b8 111
|
552 |
+
.b8 111
|
553 |
+
.b8 116
|
554 |
+
.b8 47
|
555 |
+
.b8 115
|
556 |
+
.b8 110
|
557 |
+
.b8 0
|
558 |
+
.b8 1
|
559 |
+
.b64 $L__func_begin0
|
560 |
+
.b64 $L__func_end0
|
561 |
+
.b8 2
|
562 |
+
.b8 116
|
563 |
+
.b8 114
|
564 |
+
.b8 105
|
565 |
+
.b8 116
|
566 |
+
.b8 111
|
567 |
+
.b8 110
|
568 |
+
.b8 95
|
569 |
+
.b8 95
|
570 |
+
.b8 48
|
571 |
+
.b8 100
|
572 |
+
.b8 49
|
573 |
+
.b8 100
|
574 |
+
.b8 50
|
575 |
+
.b8 100
|
576 |
+
.b8 51
|
577 |
+
.b8 100
|
578 |
+
.b8 52
|
579 |
+
.b8 100
|
580 |
+
.b8 53
|
581 |
+
.b8 100
|
582 |
+
.b8 54
|
583 |
+
.b8 100
|
584 |
+
.b8 55
|
585 |
+
.b8 100
|
586 |
+
.b8 101
|
587 |
+
.b8 56
|
588 |
+
.b8 100
|
589 |
+
.b8 101
|
590 |
+
.b8 0
|
591 |
+
.b8 116
|
592 |
+
.b8 114
|
593 |
+
.b8 105
|
594 |
+
.b8 116
|
595 |
+
.b8 111
|
596 |
+
.b8 110
|
597 |
+
.b8 95
|
598 |
+
.b8 95
|
599 |
+
.b8 48
|
600 |
+
.b8 100
|
601 |
+
.b8 49
|
602 |
+
.b8 100
|
603 |
+
.b8 50
|
604 |
+
.b8 100
|
605 |
+
.b8 51
|
606 |
+
.b8 100
|
607 |
+
.b8 52
|
608 |
+
.b8 100
|
609 |
+
.b8 53
|
610 |
+
.b8 100
|
611 |
+
.b8 54
|
612 |
+
.b8 100
|
613 |
+
.b8 55
|
614 |
+
.b8 100
|
615 |
+
.b8 101
|
616 |
+
.b8 56
|
617 |
+
.b8 100
|
618 |
+
.b8 101
|
619 |
+
.b8 0
|
620 |
+
.b8 1
|
621 |
+
.b8 18
|
622 |
+
.b8 1
|
623 |
+
.b8 1
|
624 |
+
.b8 3
|
625 |
+
.b64 $L__func_begin0
|
626 |
+
.b64 $L__func_end0
|
627 |
+
.b8 1
|
628 |
+
.b8 156
|
629 |
+
.b32 125
|
630 |
+
.b8 4
|
631 |
+
.b32 125
|
632 |
+
.b64 $L__tmp1
|
633 |
+
.b64 $L__tmp14
|
634 |
+
.b8 2
|
635 |
+
.b8 40
|
636 |
+
.b8 57
|
637 |
+
.b8 5
|
638 |
+
.b32 125
|
639 |
+
.b64 $L__tmp1
|
640 |
+
.b64 $L__tmp14
|
641 |
+
.b8 2
|
642 |
+
.b8 243
|
643 |
+
.b8 36
|
644 |
+
.b8 0
|
645 |
+
.b8 5
|
646 |
+
.b32 125
|
647 |
+
.b64 $L__tmp2
|
648 |
+
.b64 $L__tmp15
|
649 |
+
.b8 2
|
650 |
+
.b8 40
|
651 |
+
.b8 57
|
652 |
+
.b8 5
|
653 |
+
.b32 125
|
654 |
+
.b64 $L__tmp15
|
655 |
+
.b64 $L__tmp16
|
656 |
+
.b8 3
|
657 |
+
.b8 40
|
658 |
+
.b8 44
|
659 |
+
.b8 5
|
660 |
+
.b32 125
|
661 |
+
.b64 $L__tmp17
|
662 |
+
.b64 $L__tmp32
|
663 |
+
.b8 2
|
664 |
+
.b8 46
|
665 |
+
.b8 59
|
666 |
+
.b8 4
|
667 |
+
.b32 125
|
668 |
+
.b64 $L__tmp18
|
669 |
+
.b64 $L__tmp31
|
670 |
+
.b8 2
|
671 |
+
.b8 46
|
672 |
+
.b8 59
|
673 |
+
.b8 5
|
674 |
+
.b32 125
|
675 |
+
.b64 $L__tmp18
|
676 |
+
.b64 $L__tmp31
|
677 |
+
.b8 2
|
678 |
+
.b8 243
|
679 |
+
.b8 36
|
680 |
+
.b8 0
|
681 |
+
.b8 5
|
682 |
+
.b32 125
|
683 |
+
.b64 $L__tmp32
|
684 |
+
.b64 $L__tmp33
|
685 |
+
.b8 3
|
686 |
+
.b8 46
|
687 |
+
.b8 45
|
688 |
+
.b8 0
|
689 |
+
.b8 0
|
690 |
+
}
|
691 |
+
.section .debug_pubnames
|
692 |
+
{
|
693 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
694 |
+
$L__pubNames_start0:
|
695 |
+
.b8 2
|
696 |
+
.b8 0
|
697 |
+
.b32 .debug_info
|
698 |
+
.b32 407
|
699 |
+
.b32 125
|
700 |
+
.b8 116
|
701 |
+
.b8 114
|
702 |
+
.b8 105
|
703 |
+
.b8 116
|
704 |
+
.b8 111
|
705 |
+
.b8 110
|
706 |
+
.b8 95
|
707 |
+
.b8 95
|
708 |
+
.b8 48
|
709 |
+
.b8 100
|
710 |
+
.b8 49
|
711 |
+
.b8 100
|
712 |
+
.b8 50
|
713 |
+
.b8 100
|
714 |
+
.b8 51
|
715 |
+
.b8 100
|
716 |
+
.b8 52
|
717 |
+
.b8 100
|
718 |
+
.b8 53
|
719 |
+
.b8 100
|
720 |
+
.b8 54
|
721 |
+
.b8 100
|
722 |
+
.b8 55
|
723 |
+
.b8 100
|
724 |
+
.b8 101
|
725 |
+
.b8 56
|
726 |
+
.b8 100
|
727 |
+
.b8 101
|
728 |
+
.b8 0
|
729 |
+
.b32 0
|
730 |
+
$L__pubNames_end0:
|
731 |
+
}
|
732 |
+
.section .debug_pubtypes
|
733 |
+
{
|
734 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
735 |
+
$L__pubTypes_start0:
|
736 |
+
.b8 2
|
737 |
+
.b8 0
|
738 |
+
.b32 .debug_info
|
739 |
+
.b32 407
|
740 |
+
.b32 0
|
741 |
+
$L__pubTypes_end0:
|
742 |
+
}
|
743 |
+
.section .debug_loc { }
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
7 |
+
%cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32>
|
8 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
19 |
+
%9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32>
|
20 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
21 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
22 |
+
%12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
26 |
+
%16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
27 |
+
%17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
28 |
+
%18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
|
29 |
+
%19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
30 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
31 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
|
32 |
+
%22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
33 |
+
%23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
34 |
+
%24 = tt.load %23, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
35 |
+
%25 = arith.mulf %9, %12 : tensor<256xf32>
|
36 |
+
%26 = arith.select %2, %25, %cst_1 : tensor<256xi1>, tensor<256xf32>
|
37 |
+
%27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
|
38 |
+
^bb0(%arg9: f32, %arg10: f32):
|
39 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
40 |
+
tt.reduce.return %50 : f32
|
41 |
+
}) : (tensor<256xf32>) -> f32
|
42 |
+
%28 = arith.addf %27, %cst_0 : f32
|
43 |
+
%29 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32>
|
44 |
+
%30 = arith.subf %15, %29 : tensor<256xf32>
|
45 |
+
%31 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32>
|
46 |
+
%32 = arith.mulf %30, %31 : tensor<256xf32>
|
47 |
+
%33 = arith.mulf %25, %32 : tensor<256xf32>
|
48 |
+
%34 = arith.select %2, %33, %cst_1 : tensor<256xi1>, tensor<256xf32>
|
49 |
+
%35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
|
50 |
+
^bb0(%arg9: f32, %arg10: f32):
|
51 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
52 |
+
tt.reduce.return %50 : f32
|
53 |
+
}) : (tensor<256xf32>) -> f32
|
54 |
+
%36 = arith.addf %35, %cst_0 : f32
|
55 |
+
%37 = arith.divf %21, %cst_3 : tensor<1xf32>
|
56 |
+
%38 = arith.mulf %25, %cst_2 : tensor<256xf32>
|
57 |
+
%39 = tt.splat %28 : (f32) -> tensor<256xf32>
|
58 |
+
%40 = arith.subf %38, %39 : tensor<256xf32>
|
59 |
+
%41 = tt.splat %36 : (f32) -> tensor<256xf32>
|
60 |
+
%42 = arith.mulf %32, %41 : tensor<256xf32>
|
61 |
+
%43 = arith.subf %40, %42 : tensor<256xf32>
|
62 |
+
%44 = tt.broadcast %37 : (tensor<1xf32>) -> tensor<256xf32>
|
63 |
+
%45 = arith.mulf %44, %43 : tensor<256xf32>
|
64 |
+
%46 = arith.addf %24, %45 : tensor<256xf32>
|
65 |
+
tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
66 |
+
%47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
67 |
+
%48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
68 |
+
%49 = arith.truncf %46 : tensor<256xf32> to tensor<256xbf16>
|
69 |
+
tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
70 |
+
tt.return
|
71 |
+
}
|
72 |
+
}
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin
ADDED
Binary file (10.3 kB). View file
|
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 63, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 6, !dbg !9
|
10 |
+
%9 = and i32 %8, 3, !dbg !9
|
11 |
+
%10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
12 |
+
%11 = shl i32 %10, 6, !dbg !11
|
13 |
+
%12 = or i32 %11, %7, !dbg !12
|
14 |
+
br label %13, !dbg !13
|
15 |
+
|
16 |
+
13: ; preds = %5, %13
|
17 |
+
%14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ]
|
18 |
+
%15 = phi i32 [ 0, %5 ], [ %24, %13 ]
|
19 |
+
%16 = or i32 %15, %9, !dbg !14
|
20 |
+
%17 = shl i32 %16, 17, !dbg !15
|
21 |
+
%18 = add i32 %17, %12, !dbg !16
|
22 |
+
%19 = sext i32 %18 to i64, !dbg !17
|
23 |
+
%20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17
|
24 |
+
%21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18
|
25 |
+
%22 = bitcast i32 %21 to float, !dbg !18
|
26 |
+
%23 = fadd float %14, %22, !dbg !19
|
27 |
+
%24 = add nuw nsw i32 %15, 4, !dbg !13
|
28 |
+
%25 = icmp ult i32 %15, 116, !dbg !13
|
29 |
+
br i1 %25, label %13, label %26, !dbg !13
|
30 |
+
|
31 |
+
26: ; preds = %13
|
32 |
+
%27 = shl nuw nsw i32 %7, 2, !dbg !20
|
33 |
+
%28 = or i32 %27, %9, !dbg !20
|
34 |
+
%29 = zext nneg i32 %28 to i64, !dbg !20
|
35 |
+
%30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20
|
36 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20
|
37 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
38 |
+
%31 = icmp slt i32 %6, 256, !dbg !20
|
39 |
+
%32 = sext i32 %6 to i64, !dbg !20
|
40 |
+
%33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20
|
41 |
+
%34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20
|
42 |
+
%35 = bitcast float %34 to i32, !dbg !20
|
43 |
+
%36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20
|
44 |
+
%37 = bitcast i32 %36 to float, !dbg !20
|
45 |
+
%38 = fadd float %34, %37, !dbg !24
|
46 |
+
%39 = bitcast float %38 to i32, !dbg !20
|
47 |
+
%40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20
|
48 |
+
%41 = bitcast i32 %40 to float, !dbg !20
|
49 |
+
%42 = fadd float %38, %41, !dbg !24
|
50 |
+
%43 = and i32 %6, 3, !dbg !20
|
51 |
+
%44 = icmp eq i32 %43, 0, !dbg !20
|
52 |
+
%45 = and i1 %31, %44, !dbg !20
|
53 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20
|
54 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
55 |
+
%46 = zext nneg i32 %27 to i64, !dbg !20
|
56 |
+
%47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20
|
57 |
+
%48 = load float, ptr addrspace(3) %47, align 4, !dbg !20
|
58 |
+
%.frozen = freeze i32 %12
|
59 |
+
%49 = sdiv i32 %.frozen, 256, !dbg !28
|
60 |
+
%50 = mul i32 %49, 256
|
61 |
+
%.decomposed = sub i32 %.frozen, %50
|
62 |
+
%51 = sext i32 %49 to i64, !dbg !29
|
63 |
+
%52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29
|
64 |
+
%53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30
|
65 |
+
%54 = lshr i64 %53, 54, !dbg !31
|
66 |
+
%55 = and i64 %54, 512, !dbg !31
|
67 |
+
%56 = add i64 %55, %53, !dbg !31
|
68 |
+
%57 = shl i64 %56, 8, !dbg !32
|
69 |
+
%58 = sext i32 %.decomposed to i64, !dbg !33
|
70 |
+
%59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34
|
71 |
+
%60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34
|
72 |
+
%61 = icmp eq i32 %9, 0, !dbg !35
|
73 |
+
%62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35
|
74 |
+
%63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35
|
75 |
+
ret void, !dbg !36
|
76 |
+
}
|
77 |
+
|
78 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
79 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
80 |
+
|
81 |
+
; Function Attrs: convergent nocallback nounwind
|
82 |
+
declare void @llvm.nvvm.barrier0() #1
|
83 |
+
|
84 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
85 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
86 |
+
|
87 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
88 |
+
attributes #1 = { convergent nocallback nounwind }
|
89 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
90 |
+
attributes #3 = { nounwind }
|
91 |
+
|
92 |
+
!llvm.module.flags = !{!0}
|
93 |
+
!llvm.dbg.cu = !{!1}
|
94 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
95 |
+
|
96 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
97 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
98 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
99 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
100 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
|
101 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
102 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
103 |
+
!7 = !{}
|
104 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
105 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
106 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
107 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
108 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
109 |
+
!13 = !DILocation(line: 27, column: 36, scope: !5)
|
110 |
+
!14 = !DILocation(line: 28, column: 27, scope: !5)
|
111 |
+
!15 = !DILocation(line: 31, column: 47, scope: !5)
|
112 |
+
!16 = !DILocation(line: 31, column: 40, scope: !5)
|
113 |
+
!17 = !DILocation(line: 31, column: 34, scope: !5)
|
114 |
+
!18 = !DILocation(line: 31, column: 53, scope: !5)
|
115 |
+
!19 = !DILocation(line: 34, column: 38, scope: !5)
|
116 |
+
!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
|
117 |
+
!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
|
118 |
+
!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
119 |
+
!23 = !DILocation(line: 35, column: 25, scope: !21)
|
120 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
121 |
+
!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
|
122 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
123 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
124 |
+
!28 = !DILocation(line: 36, column: 20, scope: !5)
|
125 |
+
!29 = !DILocation(line: 38, column: 30, scope: !5)
|
126 |
+
!30 = !DILocation(line: 38, column: 35, scope: !5)
|
127 |
+
!31 = !DILocation(line: 41, column: 32, scope: !5)
|
128 |
+
!32 = !DILocation(line: 45, column: 40, scope: !5)
|
129 |
+
!33 = !DILocation(line: 45, column: 36, scope: !5)
|
130 |
+
!34 = !DILocation(line: 45, column: 30, scope: !5)
|
131 |
+
!35 = !DILocation(line: 45, column: 55, scope: !5)
|
132 |
+
!36 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
8 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%9 = and i32 %8, 31, !dbg !10
|
10 |
+
%10 = lshr i32 %8, 5, !dbg !10
|
11 |
+
%11 = and i32 %10, 1, !dbg !10
|
12 |
+
%urem = shl i32 %8, 2, !dbg !10
|
13 |
+
%12 = and i32 %urem, 252, !dbg !10
|
14 |
+
%13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%14 = shl i32 %13, 8, !dbg !12
|
16 |
+
%15 = or i32 %14, %12, !dbg !13
|
17 |
+
%16 = sext i32 %15 to i64, !dbg !14
|
18 |
+
%17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
|
19 |
+
%18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
|
21 |
+
%20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
|
22 |
+
%21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
|
23 |
+
%22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
|
24 |
+
%23 = bitcast i32 %21 to float, !dbg !15
|
25 |
+
%24 = bitcast i32 %22 to float, !dbg !15
|
26 |
+
%25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
|
27 |
+
%26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
28 |
+
%27 = extractvalue { i32, i32 } %26, 0, !dbg !17
|
29 |
+
%28 = extractvalue { i32, i32 } %26, 1, !dbg !17
|
30 |
+
%29 = trunc i32 %27 to i16, !dbg !17
|
31 |
+
%extelt.offset = lshr i32 %27, 16, !dbg !17
|
32 |
+
%30 = trunc i32 %extelt.offset to i16, !dbg !17
|
33 |
+
%31 = trunc i32 %28 to i16, !dbg !17
|
34 |
+
%extelt.offset1 = lshr i32 %28, 16, !dbg !17
|
35 |
+
%32 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
36 |
+
%33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
|
37 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
|
38 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
|
39 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
|
40 |
+
%37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
|
41 |
+
%38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
42 |
+
%39 = extractvalue { i32, i32 } %38, 0, !dbg !20
|
43 |
+
%40 = extractvalue { i32, i32 } %38, 1, !dbg !20
|
44 |
+
%41 = trunc i32 %39 to i16, !dbg !20
|
45 |
+
%extelt.offset2 = lshr i32 %39, 16, !dbg !20
|
46 |
+
%42 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
47 |
+
%43 = trunc i32 %40 to i16, !dbg !20
|
48 |
+
%extelt.offset3 = lshr i32 %40, 16, !dbg !20
|
49 |
+
%44 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
50 |
+
%45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
|
51 |
+
%46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
|
52 |
+
%47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
|
53 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
|
54 |
+
%49 = zext nneg i32 %12 to i64, !dbg !22
|
55 |
+
%50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
|
56 |
+
%51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
57 |
+
%52 = fadd float %35, %23, !dbg !24
|
58 |
+
%53 = fadd float %36, %24, !dbg !24
|
59 |
+
%54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
|
60 |
+
%55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
|
61 |
+
%56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
|
62 |
+
%57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
|
63 |
+
%58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
|
64 |
+
%59 = fadd <2 x float> %58, %56, !dbg !24
|
65 |
+
%60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
|
66 |
+
%61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
|
67 |
+
%62 = fadd <2 x float> %59, %61, !dbg !25
|
68 |
+
%63 = fadd float %52, %47, !dbg !25
|
69 |
+
%64 = fadd float %53, %48, !dbg !25
|
70 |
+
%65 = extractelement <2 x float> %62, i64 0, !dbg !26
|
71 |
+
%66 = extractelement <2 x float> %62, i64 1, !dbg !26
|
72 |
+
%67 = fadd float %65, %66, !dbg !26
|
73 |
+
%68 = fadd float %67, %63, !dbg !26
|
74 |
+
%69 = fadd float %68, %64, !dbg !26
|
75 |
+
%70 = bitcast float %69 to i32, !dbg !32
|
76 |
+
%71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
|
77 |
+
%72 = bitcast i32 %71 to float, !dbg !32
|
78 |
+
%73 = fadd float %69, %72, !dbg !26
|
79 |
+
%74 = bitcast float %73 to i32, !dbg !32
|
80 |
+
%75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
|
81 |
+
%76 = bitcast i32 %75 to float, !dbg !32
|
82 |
+
%77 = fadd float %73, %76, !dbg !26
|
83 |
+
%78 = bitcast float %77 to i32, !dbg !32
|
84 |
+
%79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
|
85 |
+
%80 = bitcast i32 %79 to float, !dbg !32
|
86 |
+
%81 = fadd float %77, %80, !dbg !26
|
87 |
+
%82 = bitcast float %81 to i32, !dbg !32
|
88 |
+
%83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
|
89 |
+
%84 = bitcast i32 %83 to float, !dbg !32
|
90 |
+
%85 = fadd float %81, %84, !dbg !26
|
91 |
+
%86 = bitcast float %85 to i32, !dbg !32
|
92 |
+
%87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
|
93 |
+
%88 = bitcast i32 %87 to float, !dbg !32
|
94 |
+
%89 = fadd float %85, %88, !dbg !26
|
95 |
+
%90 = icmp eq i32 %9, 0, !dbg !32
|
96 |
+
%91 = zext nneg i32 %11 to i64, !dbg !32
|
97 |
+
%92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
|
98 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
|
99 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
100 |
+
%93 = icmp slt i32 %8, 2, !dbg !32
|
101 |
+
%94 = sext i32 %8 to i64, !dbg !32
|
102 |
+
%95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
|
103 |
+
%96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
|
104 |
+
%97 = bitcast float %96 to i32, !dbg !32
|
105 |
+
%98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
|
106 |
+
%99 = bitcast i32 %98 to float, !dbg !32
|
107 |
+
%100 = fadd float %96, %99, !dbg !26
|
108 |
+
%101 = and i32 %8, 1, !dbg !32
|
109 |
+
%102 = icmp eq i32 %101, 0, !dbg !32
|
110 |
+
%103 = and i1 %93, %102, !dbg !32
|
111 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
|
112 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
113 |
+
%104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
114 |
+
%105 = fadd float %104, 0.000000e+00, !dbg !34
|
115 |
+
%106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
|
116 |
+
%107 = fsub float %65, %106, !dbg !39
|
117 |
+
%108 = fsub float %66, %106, !dbg !39
|
118 |
+
%109 = fsub float %63, %106, !dbg !39
|
119 |
+
%110 = fsub float %64, %106, !dbg !39
|
120 |
+
%111 = fmul float %107, %107, !dbg !40
|
121 |
+
%112 = fmul float %108, %108, !dbg !40
|
122 |
+
%113 = fmul float %109, %109, !dbg !40
|
123 |
+
%114 = fmul float %110, %110, !dbg !40
|
124 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
125 |
+
%115 = fadd float %111, %112, !dbg !43
|
126 |
+
%116 = fadd float %113, %115, !dbg !43
|
127 |
+
%117 = fadd float %114, %116, !dbg !43
|
128 |
+
%118 = bitcast float %117 to i32, !dbg !41
|
129 |
+
%119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
|
130 |
+
%120 = bitcast i32 %119 to float, !dbg !41
|
131 |
+
%121 = fadd float %117, %120, !dbg !43
|
132 |
+
%122 = bitcast float %121 to i32, !dbg !41
|
133 |
+
%123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
|
134 |
+
%124 = bitcast i32 %123 to float, !dbg !41
|
135 |
+
%125 = fadd float %121, %124, !dbg !43
|
136 |
+
%126 = bitcast float %125 to i32, !dbg !41
|
137 |
+
%127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
|
138 |
+
%128 = bitcast i32 %127 to float, !dbg !41
|
139 |
+
%129 = fadd float %125, %128, !dbg !43
|
140 |
+
%130 = bitcast float %129 to i32, !dbg !41
|
141 |
+
%131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
|
142 |
+
%132 = bitcast i32 %131 to float, !dbg !41
|
143 |
+
%133 = fadd float %129, %132, !dbg !43
|
144 |
+
%134 = bitcast float %133 to i32, !dbg !41
|
145 |
+
%135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
|
146 |
+
%136 = bitcast i32 %135 to float, !dbg !41
|
147 |
+
%137 = fadd float %133, %136, !dbg !43
|
148 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
|
149 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
150 |
+
%138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
|
151 |
+
%139 = bitcast float %138 to i32, !dbg !41
|
152 |
+
%140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
|
153 |
+
%141 = bitcast i32 %140 to float, !dbg !41
|
154 |
+
%142 = fadd float %138, %141, !dbg !43
|
155 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
|
156 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
157 |
+
%143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
|
158 |
+
%144 = fadd float %143, 0.000000e+00, !dbg !46
|
159 |
+
%145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
|
160 |
+
%146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
|
161 |
+
%147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
|
162 |
+
%.not.i = icmp eq i32 %147, 0, !dbg !50
|
163 |
+
br i1 %.not.i, label %150, label %148, !dbg !50
|
164 |
+
|
165 |
+
148: ; preds = %7
|
166 |
+
%149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
|
167 |
+
br label %__nv_rsqrtf.exit, !dbg !50
|
168 |
+
|
169 |
+
150: ; preds = %7
|
170 |
+
%151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
|
171 |
+
br label %__nv_rsqrtf.exit, !dbg !50
|
172 |
+
|
173 |
+
__nv_rsqrtf.exit: ; preds = %148, %150
|
174 |
+
%.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
|
175 |
+
%152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
|
176 |
+
%153 = bitcast i32 %152 to float, !dbg !23
|
177 |
+
%154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
|
178 |
+
%155 = bitcast i32 %154 to float, !dbg !23
|
179 |
+
%156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
|
180 |
+
%157 = bitcast i32 %156 to float, !dbg !23
|
181 |
+
%158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
|
182 |
+
%159 = bitcast i32 %158 to float, !dbg !23
|
183 |
+
%160 = fmul float %107, %.0.i, !dbg !51
|
184 |
+
%161 = fmul float %108, %.0.i, !dbg !51
|
185 |
+
%162 = fmul float %109, %.0.i, !dbg !51
|
186 |
+
%163 = fmul float %110, %.0.i, !dbg !51
|
187 |
+
%164 = fmul float %160, %159, !dbg !52
|
188 |
+
%165 = fmul float %161, %157, !dbg !52
|
189 |
+
%166 = fmul float %162, %155, !dbg !52
|
190 |
+
%167 = fmul float %163, %153, !dbg !52
|
191 |
+
%168 = getelementptr float, ptr addrspace(1) %4, i64 %16, !dbg !53
|
192 |
+
%169 = bitcast float %164 to i32, !dbg !54
|
193 |
+
%170 = bitcast float %165 to i32, !dbg !54
|
194 |
+
%171 = bitcast float %166 to i32, !dbg !54
|
195 |
+
%172 = bitcast float %167 to i32, !dbg !54
|
196 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %169, i32 %170, i32 %171, i32 %172, ptr addrspace(1) %168, i1 true) #6, !dbg !54
|
197 |
+
ret void, !dbg !55
|
198 |
+
}
|
199 |
+
|
200 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
201 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
202 |
+
|
203 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
204 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
205 |
+
|
206 |
+
; Function Attrs: convergent nocallback nounwind
|
207 |
+
declare void @llvm.nvvm.barrier0() #2
|
208 |
+
|
209 |
+
; Function Attrs: alwaysinline nounwind
|
210 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
211 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
212 |
+
%.not = icmp eq i32 %1, 0
|
213 |
+
br i1 %.not, label %4, label %2
|
214 |
+
|
215 |
+
2: ; preds = %0
|
216 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
217 |
+
br label %6
|
218 |
+
|
219 |
+
4: ; preds = %0
|
220 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
221 |
+
br label %6
|
222 |
+
|
223 |
+
6: ; preds = %4, %2
|
224 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
225 |
+
ret float %.0
|
226 |
+
}
|
227 |
+
|
228 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
229 |
+
|
230 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
231 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
232 |
+
|
233 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
234 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
235 |
+
|
236 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
237 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
238 |
+
attributes #2 = { convergent nocallback nounwind }
|
239 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
240 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
241 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
242 |
+
attributes #6 = { nounwind }
|
243 |
+
|
244 |
+
!llvm.module.flags = !{!0, !1}
|
245 |
+
!llvm.dbg.cu = !{!2}
|
246 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
247 |
+
!llvm.ident = !{!6}
|
248 |
+
|
249 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
250 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
251 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
252 |
+
!3 = !DIFile(filename: "cpedrbcgvftrmo3x6vfpo6dhkxbweq3ucfj5jibyyvr3hf67gsvx.py", directory: "/tmp/torchinductor_root/pe")
|
253 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
254 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
|
255 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
256 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
257 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
258 |
+
!9 = !{}
|
259 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
260 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
261 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
262 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
263 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
264 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
265 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
266 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
267 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
268 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
269 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
270 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
271 |
+
!22 = !DILocation(line: 33, column: 31, scope: !7)
|
272 |
+
!23 = !DILocation(line: 33, column: 36, scope: !7)
|
273 |
+
!24 = !DILocation(line: 35, column: 18, scope: !7)
|
274 |
+
!25 = !DILocation(line: 37, column: 18, scope: !7)
|
275 |
+
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
|
276 |
+
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
|
277 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
278 |
+
!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
|
279 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
280 |
+
!31 = !DILocation(line: 42, column: 59, scope: !27)
|
281 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
282 |
+
!33 = !DILocation(line: 42, column: 59, scope: !29)
|
283 |
+
!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
|
284 |
+
!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
285 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
286 |
+
!37 = !DILocation(line: 42, column: 45, scope: !35)
|
287 |
+
!38 = !DILocation(line: 45, column: 20, scope: !7)
|
288 |
+
!39 = !DILocation(line: 46, column: 19, scope: !7)
|
289 |
+
!40 = !DILocation(line: 47, column: 20, scope: !7)
|
290 |
+
!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
|
291 |
+
!42 = !DILocation(line: 50, column: 59, scope: !29)
|
292 |
+
!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
|
293 |
+
!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
|
294 |
+
!45 = !DILocation(line: 50, column: 59, scope: !27)
|
295 |
+
!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
|
296 |
+
!47 = !DILocation(line: 50, column: 45, scope: !35)
|
297 |
+
!48 = !DILocation(line: 53, column: 20, scope: !7)
|
298 |
+
!49 = !DILocation(line: 55, column: 20, scope: !7)
|
299 |
+
!50 = !DILocation(line: 56, column: 26, scope: !7)
|
300 |
+
!51 = !DILocation(line: 57, column: 20, scope: !7)
|
301 |
+
!52 = !DILocation(line: 58, column: 20, scope: !7)
|
302 |
+
!53 = !DILocation(line: 59, column: 25, scope: !7)
|
303 |
+
!54 = !DILocation(line: 59, column: 48, scope: !7)
|
304 |
+
!55 = !DILocation(line: 59, column: 4, scope: !7)
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
8 |
+
%c256_i32 = arith.constant 256 : i32
|
9 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
20 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
21 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
22 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
23 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
27 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
28 |
+
%17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
29 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
30 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
31 |
+
%20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
32 |
+
%21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
|
33 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
34 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
35 |
+
^bb0(%arg7: f32, %arg8: f32):
|
36 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
37 |
+
tt.reduce.return %40 : f32
|
38 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
39 |
+
%24 = arith.addf %23, %cst_2 : f32
|
40 |
+
%25 = arith.divf %24, %cst_1 : f32
|
41 |
+
%26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
|
42 |
+
%27 = arith.subf %21, %26 : tensor<256xf32, #blocked>
|
43 |
+
%28 = arith.mulf %27, %27 : tensor<256xf32, #blocked>
|
44 |
+
%29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
45 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg7: f32, %arg8: f32):
|
47 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
48 |
+
tt.reduce.return %40 : f32
|
49 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
50 |
+
%31 = arith.addf %30, %cst_2 : f32
|
51 |
+
%32 = arith.divf %31, %cst_1 : f32
|
52 |
+
%33 = arith.addf %32, %cst_0 : f32
|
53 |
+
%34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
54 |
+
%35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked>
|
55 |
+
%36 = arith.mulf %27, %35 : tensor<256xf32, #blocked>
|
56 |
+
%37 = arith.mulf %36, %19 : tensor<256xf32, #blocked>
|
57 |
+
%38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
58 |
+
%39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
59 |
+
tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
60 |
+
tt.return
|
61 |
+
}
|
62 |
+
}
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
26 |
+
%16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
|
27 |
+
%17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
28 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
29 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
30 |
+
%20 = arith.addf %8, %12 : tensor<256xf32>
|
31 |
+
%21 = arith.addf %20, %16 : tensor<256xf32>
|
32 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
33 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
34 |
+
^bb0(%arg7: f32, %arg8: f32):
|
35 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
36 |
+
tt.reduce.return %40 : f32
|
37 |
+
}) : (tensor<256xf32>) -> f32
|
38 |
+
%24 = arith.addf %23, %cst_0 : f32
|
39 |
+
%25 = arith.divf %24, %cst_1 : f32
|
40 |
+
%26 = tt.splat %25 : (f32) -> tensor<256xf32>
|
41 |
+
%27 = arith.subf %21, %26 : tensor<256xf32>
|
42 |
+
%28 = arith.mulf %27, %27 : tensor<256xf32>
|
43 |
+
%29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
44 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
45 |
+
^bb0(%arg7: f32, %arg8: f32):
|
46 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
47 |
+
tt.reduce.return %40 : f32
|
48 |
+
}) : (tensor<256xf32>) -> f32
|
49 |
+
%31 = arith.addf %30, %cst_0 : f32
|
50 |
+
%32 = arith.divf %31, %cst_1 : f32
|
51 |
+
%33 = arith.addf %32, %cst_2 : f32
|
52 |
+
%34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
53 |
+
%35 = tt.splat %34 : (f32) -> tensor<256xf32>
|
54 |
+
%36 = arith.mulf %27, %35 : tensor<256xf32>
|
55 |
+
%37 = arith.mulf %36, %19 : tensor<256xf32>
|
56 |
+
%38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
57 |
+
%39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
58 |
+
tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
59 |
+
tt.return
|
60 |
+
}
|
61 |
+
}
|
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2d3d4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4) local_unnamed_addr !dbg !5 {
|
5 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%7 = shl i32 %6, 3, !dbg !8
|
7 |
+
%8 = and i32 %7, 1016, !dbg !8
|
8 |
+
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%10 = shl i32 %9, 10, !dbg !10
|
10 |
+
%11 = or i32 %10, %8, !dbg !11
|
11 |
+
%.frozen = freeze i32 %11
|
12 |
+
%12 = sdiv i32 %.frozen, 256, !dbg !12
|
13 |
+
%13 = srem i32 %12, 3, !dbg !13
|
14 |
+
%14 = mul i32 %12, 256
|
15 |
+
%.decomposed = sub i32 %.frozen, %14
|
16 |
+
%15 = sdiv i32 %11, 768, !dbg !14
|
17 |
+
%16 = shl nsw i32 %15, 8, !dbg !15
|
18 |
+
%17 = add nsw i32 %16, %.decomposed, !dbg !16
|
19 |
+
%18 = sext i32 %17 to i64, !dbg !17
|
20 |
+
%19 = getelementptr i16, ptr addrspace(1) %0, i64 %18, !dbg !17
|
21 |
+
%20 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %19, i1 true) #1, !dbg !18
|
22 |
+
%21 = extractvalue { i32, i32, i32, i32 } %20, 0, !dbg !18
|
23 |
+
%22 = extractvalue { i32, i32, i32, i32 } %20, 1, !dbg !18
|
24 |
+
%23 = extractvalue { i32, i32, i32, i32 } %20, 2, !dbg !18
|
25 |
+
%24 = extractvalue { i32, i32, i32, i32 } %20, 3, !dbg !18
|
26 |
+
%25 = trunc i32 %21 to i16, !dbg !18
|
27 |
+
%extelt.offset = lshr i32 %21, 16, !dbg !18
|
28 |
+
%26 = trunc i32 %extelt.offset to i16, !dbg !18
|
29 |
+
%27 = trunc i32 %22 to i16, !dbg !18
|
30 |
+
%extelt.offset1 = lshr i32 %22, 16, !dbg !18
|
31 |
+
%28 = trunc i32 %extelt.offset1 to i16, !dbg !18
|
32 |
+
%29 = trunc i32 %23 to i16, !dbg !18
|
33 |
+
%extelt.offset2 = lshr i32 %23, 16, !dbg !18
|
34 |
+
%30 = trunc i32 %extelt.offset2 to i16, !dbg !18
|
35 |
+
%31 = trunc i32 %24 to i16, !dbg !18
|
36 |
+
%extelt.offset3 = lshr i32 %24, 16, !dbg !18
|
37 |
+
%32 = trunc i32 %extelt.offset3 to i16, !dbg !18
|
38 |
+
%33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #1, !dbg !19
|
39 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #1, !dbg !19
|
40 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #1, !dbg !19
|
41 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #1, !dbg !19
|
42 |
+
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #1, !dbg !19
|
43 |
+
%38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #1, !dbg !19
|
44 |
+
%39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #1, !dbg !19
|
45 |
+
%40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #1, !dbg !19
|
46 |
+
%41 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !20
|
47 |
+
%42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #1, !dbg !21
|
48 |
+
%43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21
|
49 |
+
%44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21
|
50 |
+
%45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21
|
51 |
+
%46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21
|
52 |
+
%47 = trunc i32 %43 to i16, !dbg !21
|
53 |
+
%extelt.offset4 = lshr i32 %43, 16, !dbg !21
|
54 |
+
%48 = trunc i32 %extelt.offset4 to i16, !dbg !21
|
55 |
+
%49 = trunc i32 %44 to i16, !dbg !21
|
56 |
+
%extelt.offset5 = lshr i32 %44, 16, !dbg !21
|
57 |
+
%50 = trunc i32 %extelt.offset5 to i16, !dbg !21
|
58 |
+
%51 = trunc i32 %45 to i16, !dbg !21
|
59 |
+
%extelt.offset6 = lshr i32 %45, 16, !dbg !21
|
60 |
+
%52 = trunc i32 %extelt.offset6 to i16, !dbg !21
|
61 |
+
%53 = trunc i32 %46 to i16, !dbg !21
|
62 |
+
%extelt.offset7 = lshr i32 %46, 16, !dbg !21
|
63 |
+
%54 = trunc i32 %extelt.offset7 to i16, !dbg !21
|
64 |
+
%55 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #1, !dbg !22
|
65 |
+
%56 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #1, !dbg !22
|
66 |
+
%57 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #1, !dbg !22
|
67 |
+
%58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #1, !dbg !22
|
68 |
+
%59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #1, !dbg !22
|
69 |
+
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #1, !dbg !22
|
70 |
+
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #1, !dbg !22
|
71 |
+
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #1, !dbg !22
|
72 |
+
%63 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !23
|
73 |
+
%64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %63, i1 true) #1, !dbg !24
|
74 |
+
%65 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !24
|
75 |
+
%66 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !24
|
76 |
+
%67 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !24
|
77 |
+
%68 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !24
|
78 |
+
%69 = trunc i32 %65 to i16, !dbg !24
|
79 |
+
%extelt.offset8 = lshr i32 %65, 16, !dbg !24
|
80 |
+
%70 = trunc i32 %extelt.offset8 to i16, !dbg !24
|
81 |
+
%71 = trunc i32 %66 to i16, !dbg !24
|
82 |
+
%extelt.offset9 = lshr i32 %66, 16, !dbg !24
|
83 |
+
%72 = trunc i32 %extelt.offset9 to i16, !dbg !24
|
84 |
+
%73 = trunc i32 %67 to i16, !dbg !24
|
85 |
+
%extelt.offset10 = lshr i32 %67, 16, !dbg !24
|
86 |
+
%74 = trunc i32 %extelt.offset10 to i16, !dbg !24
|
87 |
+
%75 = trunc i32 %68 to i16, !dbg !24
|
88 |
+
%extelt.offset11 = lshr i32 %68, 16, !dbg !24
|
89 |
+
%76 = trunc i32 %extelt.offset11 to i16, !dbg !24
|
90 |
+
%77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #1, !dbg !25
|
91 |
+
%78 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #1, !dbg !25
|
92 |
+
%79 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #1, !dbg !25
|
93 |
+
%80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #1, !dbg !25
|
94 |
+
%81 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #1, !dbg !25
|
95 |
+
%82 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #1, !dbg !25
|
96 |
+
%83 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %75) #1, !dbg !25
|
97 |
+
%84 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %76) #1, !dbg !25
|
98 |
+
%85 = icmp eq i32 %13, 2, !dbg !26
|
99 |
+
%86 = select i1 %85, float %33, float 0.000000e+00, !dbg !27
|
100 |
+
%87 = select i1 %85, float %34, float 0.000000e+00, !dbg !27
|
101 |
+
%88 = select i1 %85, float %35, float 0.000000e+00, !dbg !27
|
102 |
+
%89 = select i1 %85, float %36, float 0.000000e+00, !dbg !27
|
103 |
+
%90 = select i1 %85, float %37, float 0.000000e+00, !dbg !27
|
104 |
+
%91 = select i1 %85, float %38, float 0.000000e+00, !dbg !27
|
105 |
+
%92 = select i1 %85, float %39, float 0.000000e+00, !dbg !27
|
106 |
+
%93 = select i1 %85, float %40, float 0.000000e+00, !dbg !27
|
107 |
+
%94 = icmp eq i32 %13, 1, !dbg !28
|
108 |
+
%95 = select i1 %94, float %55, float 0.000000e+00, !dbg !29
|
109 |
+
%96 = select i1 %94, float %56, float 0.000000e+00, !dbg !29
|
110 |
+
%97 = select i1 %94, float %57, float 0.000000e+00, !dbg !29
|
111 |
+
%98 = select i1 %94, float %58, float 0.000000e+00, !dbg !29
|
112 |
+
%99 = select i1 %94, float %59, float 0.000000e+00, !dbg !29
|
113 |
+
%100 = select i1 %94, float %60, float 0.000000e+00, !dbg !29
|
114 |
+
%101 = select i1 %94, float %61, float 0.000000e+00, !dbg !29
|
115 |
+
%102 = select i1 %94, float %62, float 0.000000e+00, !dbg !29
|
116 |
+
%103 = fadd float %86, %95, !dbg !30
|
117 |
+
%104 = fadd float %87, %96, !dbg !30
|
118 |
+
%105 = fadd float %88, %97, !dbg !30
|
119 |
+
%106 = fadd float %89, %98, !dbg !30
|
120 |
+
%107 = fadd float %90, %99, !dbg !30
|
121 |
+
%108 = fadd float %91, %100, !dbg !30
|
122 |
+
%109 = fadd float %92, %101, !dbg !30
|
123 |
+
%110 = fadd float %93, %102, !dbg !30
|
124 |
+
%111 = icmp eq i32 %13, 0, !dbg !31
|
125 |
+
%112 = select i1 %111, float %77, float 0.000000e+00, !dbg !32
|
126 |
+
%113 = select i1 %111, float %78, float 0.000000e+00, !dbg !32
|
127 |
+
%114 = select i1 %111, float %79, float 0.000000e+00, !dbg !32
|
128 |
+
%115 = select i1 %111, float %80, float 0.000000e+00, !dbg !32
|
129 |
+
%116 = select i1 %111, float %81, float 0.000000e+00, !dbg !32
|
130 |
+
%117 = select i1 %111, float %82, float 0.000000e+00, !dbg !32
|
131 |
+
%118 = select i1 %111, float %83, float 0.000000e+00, !dbg !32
|
132 |
+
%119 = select i1 %111, float %84, float 0.000000e+00, !dbg !32
|
133 |
+
%120 = fadd float %103, %112, !dbg !33
|
134 |
+
%121 = fadd float %104, %113, !dbg !33
|
135 |
+
%122 = fadd float %105, %114, !dbg !33
|
136 |
+
%123 = fadd float %106, %115, !dbg !33
|
137 |
+
%124 = fadd float %107, %116, !dbg !33
|
138 |
+
%125 = fadd float %108, %117, !dbg !33
|
139 |
+
%126 = fadd float %109, %118, !dbg !33
|
140 |
+
%127 = fadd float %110, %119, !dbg !33
|
141 |
+
%128 = sext i32 %11 to i64, !dbg !34
|
142 |
+
%129 = getelementptr i16, ptr addrspace(1) %3, i64 %128, !dbg !34
|
143 |
+
%130 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %120) #1, !dbg !35
|
144 |
+
%131 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %121) #1, !dbg !35
|
145 |
+
%132 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %122) #1, !dbg !35
|
146 |
+
%133 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %123) #1, !dbg !35
|
147 |
+
%134 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %124) #1, !dbg !35
|
148 |
+
%135 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %125) #1, !dbg !35
|
149 |
+
%136 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %126) #1, !dbg !35
|
150 |
+
%137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %127) #1, !dbg !35
|
151 |
+
%138 = insertelement <2 x i16> undef, i16 %130, i64 0, !dbg !35
|
152 |
+
%139 = insertelement <2 x i16> %138, i16 %131, i64 1, !dbg !35
|
153 |
+
%140 = bitcast <2 x i16> %139 to i32, !dbg !35
|
154 |
+
%141 = insertelement <2 x i16> undef, i16 %132, i64 0, !dbg !35
|
155 |
+
%142 = insertelement <2 x i16> %141, i16 %133, i64 1, !dbg !35
|
156 |
+
%143 = bitcast <2 x i16> %142 to i32, !dbg !35
|
157 |
+
%144 = insertelement <2 x i16> undef, i16 %134, i64 0, !dbg !35
|
158 |
+
%145 = insertelement <2 x i16> %144, i16 %135, i64 1, !dbg !35
|
159 |
+
%146 = bitcast <2 x i16> %145 to i32, !dbg !35
|
160 |
+
%147 = insertelement <2 x i16> undef, i16 %136, i64 0, !dbg !35
|
161 |
+
%148 = insertelement <2 x i16> %147, i16 %137, i64 1, !dbg !35
|
162 |
+
%149 = bitcast <2 x i16> %148 to i32, !dbg !35
|
163 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %140, i32 %143, i32 %146, i32 %149, ptr addrspace(1) %129, i1 true) #1, !dbg !35
|
164 |
+
ret void, !dbg !36
|
165 |
+
}
|
166 |
+
|
167 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
168 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
169 |
+
|
170 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
171 |
+
attributes #1 = { nounwind }
|
172 |
+
|
173 |
+
!llvm.module.flags = !{!0}
|
174 |
+
!llvm.dbg.cu = !{!1}
|
175 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
176 |
+
|
177 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
178 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
179 |
+
!2 = !DIFile(filename: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py", directory: "/tmp/torchinductor_root/63")
|
180 |
+
!3 = !{ptr @triton__0d1d2d3d4de, !"kernel", i32 1}
|
181 |
+
!4 = !{ptr @triton__0d1d2d3d4de, !"maxntidx", i32 128}
|
182 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4de", linkageName: "triton__0d1d2d3d4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
183 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
184 |
+
!7 = !{}
|
185 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
186 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
187 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
188 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
189 |
+
!12 = !DILocation(line: 23, column: 20, scope: !5)
|
190 |
+
!13 = !DILocation(line: 23, column: 27, scope: !5)
|
191 |
+
!14 = !DILocation(line: 25, column: 20, scope: !5)
|
192 |
+
!15 = !DILocation(line: 27, column: 40, scope: !5)
|
193 |
+
!16 = !DILocation(line: 27, column: 36, scope: !5)
|
194 |
+
!17 = !DILocation(line: 27, column: 30, scope: !5)
|
195 |
+
!18 = !DILocation(line: 27, column: 46, scope: !5)
|
196 |
+
!19 = !DILocation(line: 27, column: 85, scope: !5)
|
197 |
+
!20 = !DILocation(line: 28, column: 30, scope: !5)
|
198 |
+
!21 = !DILocation(line: 28, column: 46, scope: !5)
|
199 |
+
!22 = !DILocation(line: 28, column: 85, scope: !5)
|
200 |
+
!23 = !DILocation(line: 29, column: 31, scope: !5)
|
201 |
+
!24 = !DILocation(line: 29, column: 47, scope: !5)
|
202 |
+
!25 = !DILocation(line: 29, column: 86, scope: !5)
|
203 |
+
!26 = !DILocation(line: 32, column: 19, scope: !5)
|
204 |
+
!27 = !DILocation(line: 34, column: 32, scope: !5)
|
205 |
+
!28 = !DILocation(line: 36, column: 19, scope: !5)
|
206 |
+
!29 = !DILocation(line: 37, column: 32, scope: !5)
|
207 |
+
!30 = !DILocation(line: 38, column: 19, scope: !5)
|
208 |
+
!31 = !DILocation(line: 40, column: 20, scope: !5)
|
209 |
+
!32 = !DILocation(line: 41, column: 35, scope: !5)
|
210 |
+
!33 = !DILocation(line: 42, column: 20, scope: !5)
|
211 |
+
!34 = !DILocation(line: 43, column: 25, scope: !5)
|
212 |
+
!35 = !DILocation(line: 43, column: 37, scope: !5)
|
213 |
+
!36 = !DILocation(line: 43, column: 4, scope: !5)
|
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx
ADDED
@@ -0,0 +1,495 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2d3d4de(
|
12 |
+
.param .u64 triton__0d1d2d3d4de_param_0,
|
13 |
+
.param .u64 triton__0d1d2d3d4de_param_1,
|
14 |
+
.param .u64 triton__0d1d2d3d4de_param_2,
|
15 |
+
.param .u64 triton__0d1d2d3d4de_param_3,
|
16 |
+
.param .u32 triton__0d1d2d3d4de_param_4
|
17 |
+
)
|
18 |
+
.maxntid 128, 1, 1
|
19 |
+
{
|
20 |
+
.reg .pred %p<8>;
|
21 |
+
.reg .b16 %rs<33>;
|
22 |
+
.reg .b32 %r<77>;
|
23 |
+
.reg .f32 %f<65>;
|
24 |
+
.reg .b64 %rd<11>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0];
|
30 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1];
|
31 |
+
$L__tmp0:
|
32 |
+
.loc 1 21 36
|
33 |
+
mov.u32 %r50, %tid.x;
|
34 |
+
shl.b32 %r51, %r50, 3;
|
35 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2];
|
36 |
+
and.b32 %r52, %r51, 1016;
|
37 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3];
|
38 |
+
.loc 1 20 28
|
39 |
+
mov.u32 %r1, %ctaid.x;
|
40 |
+
.loc 1 20 33
|
41 |
+
shl.b32 %r53, %r1, 10;
|
42 |
+
.loc 1 21 23
|
43 |
+
or.b32 %r54, %r53, %r52;
|
44 |
+
.loc 1 23 20
|
45 |
+
shr.s32 %r56, %r54, 31;
|
46 |
+
shr.u32 %r57, %r56, 24;
|
47 |
+
add.s32 %r58, %r54, %r57;
|
48 |
+
shr.s32 %r59, %r58, 8;
|
49 |
+
.loc 1 23 27
|
50 |
+
mul.hi.s32 %r60, %r59, 1431655766;
|
51 |
+
shr.u32 %r61, %r60, 31;
|
52 |
+
add.s32 %r62, %r60, %r61;
|
53 |
+
mul.lo.s32 %r63, %r62, 3;
|
54 |
+
sub.s32 %r64, %r59, %r63;
|
55 |
+
and.b32 %r65, %r58, -256;
|
56 |
+
sub.s32 %r66, %r54, %r65;
|
57 |
+
.loc 1 25 20
|
58 |
+
mul.hi.s32 %r67, %r54, 715827883;
|
59 |
+
shr.u32 %r68, %r67, 31;
|
60 |
+
shr.u32 %r69, %r67, 7;
|
61 |
+
add.s32 %r70, %r69, %r68;
|
62 |
+
.loc 1 27 40
|
63 |
+
shl.b32 %r71, %r70, 8;
|
64 |
+
.loc 1 27 36
|
65 |
+
add.s32 %r72, %r71, %r66;
|
66 |
+
.loc 1 27 30
|
67 |
+
mul.wide.s32 %rd9, %r72, 2;
|
68 |
+
add.s64 %rd1, %rd5, %rd9;
|
69 |
+
mov.pred %p1, -1;
|
70 |
+
.loc 1 27 46
|
71 |
+
mov.u32 %r2, 0x0;
|
72 |
+
mov.u32 %r3, 0x0;
|
73 |
+
mov.u32 %r4, 0x0;
|
74 |
+
mov.u32 %r5, 0x0;
|
75 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
76 |
+
cvt.u16.u32 %rs1, %r2;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
78 |
+
cvt.u16.u32 %rs3, %r3;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
80 |
+
cvt.u16.u32 %rs5, %r4;
|
81 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
|
82 |
+
cvt.u16.u32 %rs7, %r5;
|
83 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
|
84 |
+
.loc 1 27 85
|
85 |
+
cvt.f32.bf16 %r6, %rs1;
|
86 |
+
mov.b32 %f1, %r6;
|
87 |
+
cvt.f32.bf16 %r7, %rs2;
|
88 |
+
mov.b32 %f2, %r7;
|
89 |
+
cvt.f32.bf16 %r8, %rs3;
|
90 |
+
mov.b32 %f3, %r8;
|
91 |
+
cvt.f32.bf16 %r9, %rs4;
|
92 |
+
mov.b32 %f4, %r9;
|
93 |
+
cvt.f32.bf16 %r10, %rs5;
|
94 |
+
mov.b32 %f5, %r10;
|
95 |
+
cvt.f32.bf16 %r11, %rs6;
|
96 |
+
mov.b32 %f6, %r11;
|
97 |
+
cvt.f32.bf16 %r12, %rs7;
|
98 |
+
mov.b32 %f7, %r12;
|
99 |
+
cvt.f32.bf16 %r13, %rs8;
|
100 |
+
mov.b32 %f8, %r13;
|
101 |
+
.loc 1 28 30
|
102 |
+
add.s64 %rd2, %rd6, %rd9;
|
103 |
+
.loc 1 28 46
|
104 |
+
mov.u32 %r14, 0x0;
|
105 |
+
mov.u32 %r15, 0x0;
|
106 |
+
mov.u32 %r16, 0x0;
|
107 |
+
mov.u32 %r17, 0x0;
|
108 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
|
109 |
+
cvt.u16.u32 %rs9, %r14;
|
110 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
|
111 |
+
cvt.u16.u32 %rs11, %r15;
|
112 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
|
113 |
+
cvt.u16.u32 %rs13, %r16;
|
114 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
|
115 |
+
cvt.u16.u32 %rs15, %r17;
|
116 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
|
117 |
+
.loc 1 28 85
|
118 |
+
cvt.f32.bf16 %r18, %rs9;
|
119 |
+
mov.b32 %f9, %r18;
|
120 |
+
cvt.f32.bf16 %r19, %rs10;
|
121 |
+
mov.b32 %f10, %r19;
|
122 |
+
cvt.f32.bf16 %r20, %rs11;
|
123 |
+
mov.b32 %f11, %r20;
|
124 |
+
cvt.f32.bf16 %r21, %rs12;
|
125 |
+
mov.b32 %f12, %r21;
|
126 |
+
cvt.f32.bf16 %r22, %rs13;
|
127 |
+
mov.b32 %f13, %r22;
|
128 |
+
cvt.f32.bf16 %r23, %rs14;
|
129 |
+
mov.b32 %f14, %r23;
|
130 |
+
cvt.f32.bf16 %r24, %rs15;
|
131 |
+
mov.b32 %f15, %r24;
|
132 |
+
cvt.f32.bf16 %r25, %rs16;
|
133 |
+
mov.b32 %f16, %r25;
|
134 |
+
.loc 1 29 31
|
135 |
+
add.s64 %rd3, %rd7, %rd9;
|
136 |
+
.loc 1 29 47
|
137 |
+
mov.u32 %r26, 0x0;
|
138 |
+
mov.u32 %r27, 0x0;
|
139 |
+
mov.u32 %r28, 0x0;
|
140 |
+
mov.u32 %r29, 0x0;
|
141 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ];
|
142 |
+
cvt.u16.u32 %rs17, %r26;
|
143 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; }
|
144 |
+
cvt.u16.u32 %rs19, %r27;
|
145 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; }
|
146 |
+
cvt.u16.u32 %rs21, %r28;
|
147 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; }
|
148 |
+
cvt.u16.u32 %rs23, %r29;
|
149 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; }
|
150 |
+
.loc 1 29 86
|
151 |
+
cvt.f32.bf16 %r30, %rs17;
|
152 |
+
mov.b32 %f17, %r30;
|
153 |
+
cvt.f32.bf16 %r31, %rs18;
|
154 |
+
mov.b32 %f18, %r31;
|
155 |
+
cvt.f32.bf16 %r32, %rs19;
|
156 |
+
mov.b32 %f19, %r32;
|
157 |
+
cvt.f32.bf16 %r33, %rs20;
|
158 |
+
mov.b32 %f20, %r33;
|
159 |
+
cvt.f32.bf16 %r34, %rs21;
|
160 |
+
mov.b32 %f21, %r34;
|
161 |
+
cvt.f32.bf16 %r35, %rs22;
|
162 |
+
mov.b32 %f22, %r35;
|
163 |
+
cvt.f32.bf16 %r36, %rs23;
|
164 |
+
mov.b32 %f23, %r36;
|
165 |
+
cvt.f32.bf16 %r37, %rs24;
|
166 |
+
mov.b32 %f24, %r37;
|
167 |
+
.loc 1 32 19
|
168 |
+
setp.eq.s32 %p5, %r64, 2;
|
169 |
+
.loc 1 34 32
|
170 |
+
selp.f32 %f25, %f1, 0f00000000, %p5;
|
171 |
+
selp.f32 %f26, %f2, 0f00000000, %p5;
|
172 |
+
selp.f32 %f27, %f3, 0f00000000, %p5;
|
173 |
+
selp.f32 %f28, %f4, 0f00000000, %p5;
|
174 |
+
selp.f32 %f29, %f5, 0f00000000, %p5;
|
175 |
+
selp.f32 %f30, %f6, 0f00000000, %p5;
|
176 |
+
selp.f32 %f31, %f7, 0f00000000, %p5;
|
177 |
+
selp.f32 %f32, %f8, 0f00000000, %p5;
|
178 |
+
.loc 1 36 19
|
179 |
+
setp.eq.s32 %p6, %r64, 1;
|
180 |
+
.loc 1 37 32
|
181 |
+
selp.f32 %f33, %f9, 0f00000000, %p6;
|
182 |
+
selp.f32 %f34, %f10, 0f00000000, %p6;
|
183 |
+
selp.f32 %f35, %f11, 0f00000000, %p6;
|
184 |
+
selp.f32 %f36, %f12, 0f00000000, %p6;
|
185 |
+
selp.f32 %f37, %f13, 0f00000000, %p6;
|
186 |
+
selp.f32 %f38, %f14, 0f00000000, %p6;
|
187 |
+
selp.f32 %f39, %f15, 0f00000000, %p6;
|
188 |
+
selp.f32 %f40, %f16, 0f00000000, %p6;
|
189 |
+
.loc 1 38 19
|
190 |
+
add.f32 %f41, %f25, %f33;
|
191 |
+
add.f32 %f42, %f26, %f34;
|
192 |
+
add.f32 %f43, %f27, %f35;
|
193 |
+
add.f32 %f44, %f28, %f36;
|
194 |
+
add.f32 %f45, %f29, %f37;
|
195 |
+
add.f32 %f46, %f30, %f38;
|
196 |
+
add.f32 %f47, %f31, %f39;
|
197 |
+
add.f32 %f48, %f32, %f40;
|
198 |
+
.loc 1 40 20
|
199 |
+
setp.eq.s32 %p7, %r64, 0;
|
200 |
+
.loc 1 41 35
|
201 |
+
selp.f32 %f49, %f17, 0f00000000, %p7;
|
202 |
+
selp.f32 %f50, %f18, 0f00000000, %p7;
|
203 |
+
selp.f32 %f51, %f19, 0f00000000, %p7;
|
204 |
+
selp.f32 %f52, %f20, 0f00000000, %p7;
|
205 |
+
selp.f32 %f53, %f21, 0f00000000, %p7;
|
206 |
+
selp.f32 %f54, %f22, 0f00000000, %p7;
|
207 |
+
selp.f32 %f55, %f23, 0f00000000, %p7;
|
208 |
+
selp.f32 %f56, %f24, 0f00000000, %p7;
|
209 |
+
.loc 1 42 20
|
210 |
+
add.f32 %f57, %f41, %f49;
|
211 |
+
add.f32 %f58, %f42, %f50;
|
212 |
+
add.f32 %f59, %f43, %f51;
|
213 |
+
add.f32 %f60, %f44, %f52;
|
214 |
+
add.f32 %f61, %f45, %f53;
|
215 |
+
add.f32 %f62, %f46, %f54;
|
216 |
+
add.f32 %f63, %f47, %f55;
|
217 |
+
add.f32 %f64, %f48, %f56;
|
218 |
+
.loc 1 43 25
|
219 |
+
mul.wide.s32 %rd10, %r54, 2;
|
220 |
+
add.s64 %rd4, %rd8, %rd10;
|
221 |
+
.loc 1 43 37
|
222 |
+
mov.b32 %r38, %f57;
|
223 |
+
cvt.rn.bf16.f32 %rs25, %r38;
|
224 |
+
mov.b32 %r39, %f58;
|
225 |
+
cvt.rn.bf16.f32 %rs26, %r39;
|
226 |
+
mov.b32 %r40, %f59;
|
227 |
+
cvt.rn.bf16.f32 %rs27, %r40;
|
228 |
+
mov.b32 %r41, %f60;
|
229 |
+
cvt.rn.bf16.f32 %rs28, %r41;
|
230 |
+
mov.b32 %r42, %f61;
|
231 |
+
cvt.rn.bf16.f32 %rs29, %r42;
|
232 |
+
mov.b32 %r43, %f62;
|
233 |
+
cvt.rn.bf16.f32 %rs30, %r43;
|
234 |
+
mov.b32 %r44, %f63;
|
235 |
+
cvt.rn.bf16.f32 %rs31, %r44;
|
236 |
+
mov.b32 %r45, %f64;
|
237 |
+
cvt.rn.bf16.f32 %rs32, %r45;
|
238 |
+
mov.b32 %r73, {%rs25, %rs26};
|
239 |
+
mov.b32 %r74, {%rs27, %rs28};
|
240 |
+
mov.b32 %r75, {%rs29, %rs30};
|
241 |
+
mov.b32 %r76, {%rs31, %rs32};
|
242 |
+
@%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 };
|
243 |
+
.loc 1 43 4
|
244 |
+
ret;
|
245 |
+
$L__tmp1:
|
246 |
+
$L__func_end0:
|
247 |
+
|
248 |
+
}
|
249 |
+
.file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
|
250 |
+
.section .debug_abbrev
|
251 |
+
{
|
252 |
+
.b8 1
|
253 |
+
.b8 17
|
254 |
+
.b8 1
|
255 |
+
.b8 37
|
256 |
+
.b8 8
|
257 |
+
.b8 19
|
258 |
+
.b8 5
|
259 |
+
.b8 3
|
260 |
+
.b8 8
|
261 |
+
.b8 16
|
262 |
+
.b8 6
|
263 |
+
.b8 27
|
264 |
+
.b8 8
|
265 |
+
.b8 180
|
266 |
+
.b8 66
|
267 |
+
.b8 12
|
268 |
+
.b8 17
|
269 |
+
.b8 1
|
270 |
+
.b8 18
|
271 |
+
.b8 1
|
272 |
+
.b8 0
|
273 |
+
.b8 0
|
274 |
+
.b8 2
|
275 |
+
.b8 46
|
276 |
+
.b8 0
|
277 |
+
.b8 17
|
278 |
+
.b8 1
|
279 |
+
.b8 18
|
280 |
+
.b8 1
|
281 |
+
.b8 64
|
282 |
+
.b8 10
|
283 |
+
.b8 135
|
284 |
+
.b8 64
|
285 |
+
.b8 8
|
286 |
+
.b8 3
|
287 |
+
.b8 8
|
288 |
+
.b8 58
|
289 |
+
.b8 11
|
290 |
+
.b8 59
|
291 |
+
.b8 11
|
292 |
+
.b8 63
|
293 |
+
.b8 12
|
294 |
+
.b8 0
|
295 |
+
.b8 0
|
296 |
+
.b8 0
|
297 |
+
}
|
298 |
+
.section .debug_info
|
299 |
+
{
|
300 |
+
.b32 184
|
301 |
+
.b8 2
|
302 |
+
.b8 0
|
303 |
+
.b32 .debug_abbrev
|
304 |
+
.b8 8
|
305 |
+
.b8 1
|
306 |
+
.b8 116
|
307 |
+
.b8 114
|
308 |
+
.b8 105
|
309 |
+
.b8 116
|
310 |
+
.b8 111
|
311 |
+
.b8 110
|
312 |
+
.b8 0
|
313 |
+
.b8 2
|
314 |
+
.b8 0
|
315 |
+
.b8 99
|
316 |
+
.b8 54
|
317 |
+
.b8 51
|
318 |
+
.b8 114
|
319 |
+
.b8 55
|
320 |
+
.b8 105
|
321 |
+
.b8 117
|
322 |
+
.b8 114
|
323 |
+
.b8 119
|
324 |
+
.b8 107
|
325 |
+
.b8 53
|
326 |
+
.b8 121
|
327 |
+
.b8 100
|
328 |
+
.b8 108
|
329 |
+
.b8 115
|
330 |
+
.b8 119
|
331 |
+
.b8 104
|
332 |
+
.b8 55
|
333 |
+
.b8 114
|
334 |
+
.b8 118
|
335 |
+
.b8 104
|
336 |
+
.b8 99
|
337 |
+
.b8 109
|
338 |
+
.b8 108
|
339 |
+
.b8 120
|
340 |
+
.b8 50
|
341 |
+
.b8 99
|
342 |
+
.b8 102
|
343 |
+
.b8 114
|
344 |
+
.b8 101
|
345 |
+
.b8 116
|
346 |
+
.b8 108
|
347 |
+
.b8 114
|
348 |
+
.b8 101
|
349 |
+
.b8 119
|
350 |
+
.b8 103
|
351 |
+
.b8 119
|
352 |
+
.b8 54
|
353 |
+
.b8 116
|
354 |
+
.b8 108
|
355 |
+
.b8 106
|
356 |
+
.b8 108
|
357 |
+
.b8 117
|
358 |
+
.b8 114
|
359 |
+
.b8 115
|
360 |
+
.b8 115
|
361 |
+
.b8 104
|
362 |
+
.b8 103
|
363 |
+
.b8 116
|
364 |
+
.b8 102
|
365 |
+
.b8 112
|
366 |
+
.b8 112
|
367 |
+
.b8 46
|
368 |
+
.b8 112
|
369 |
+
.b8 121
|
370 |
+
.b8 0
|
371 |
+
.b32 .debug_line
|
372 |
+
.b8 47
|
373 |
+
.b8 116
|
374 |
+
.b8 109
|
375 |
+
.b8 112
|
376 |
+
.b8 47
|
377 |
+
.b8 116
|
378 |
+
.b8 111
|
379 |
+
.b8 114
|
380 |
+
.b8 99
|
381 |
+
.b8 104
|
382 |
+
.b8 105
|
383 |
+
.b8 110
|
384 |
+
.b8 100
|
385 |
+
.b8 117
|
386 |
+
.b8 99
|
387 |
+
.b8 116
|
388 |
+
.b8 111
|
389 |
+
.b8 114
|
390 |
+
.b8 95
|
391 |
+
.b8 114
|
392 |
+
.b8 111
|
393 |
+
.b8 111
|
394 |
+
.b8 116
|
395 |
+
.b8 47
|
396 |
+
.b8 54
|
397 |
+
.b8 51
|
398 |
+
.b8 0
|
399 |
+
.b8 1
|
400 |
+
.b64 $L__func_begin0
|
401 |
+
.b64 $L__func_end0
|
402 |
+
.b8 2
|
403 |
+
.b64 $L__func_begin0
|
404 |
+
.b64 $L__func_end0
|
405 |
+
.b8 1
|
406 |
+
.b8 156
|
407 |
+
.b8 116
|
408 |
+
.b8 114
|
409 |
+
.b8 105
|
410 |
+
.b8 116
|
411 |
+
.b8 111
|
412 |
+
.b8 110
|
413 |
+
.b8 95
|
414 |
+
.b8 95
|
415 |
+
.b8 48
|
416 |
+
.b8 100
|
417 |
+
.b8 49
|
418 |
+
.b8 100
|
419 |
+
.b8 50
|
420 |
+
.b8 100
|
421 |
+
.b8 51
|
422 |
+
.b8 100
|
423 |
+
.b8 52
|
424 |
+
.b8 100
|
425 |
+
.b8 101
|
426 |
+
.b8 0
|
427 |
+
.b8 116
|
428 |
+
.b8 114
|
429 |
+
.b8 105
|
430 |
+
.b8 116
|
431 |
+
.b8 111
|
432 |
+
.b8 110
|
433 |
+
.b8 95
|
434 |
+
.b8 95
|
435 |
+
.b8 48
|
436 |
+
.b8 100
|
437 |
+
.b8 49
|
438 |
+
.b8 100
|
439 |
+
.b8 50
|
440 |
+
.b8 100
|
441 |
+
.b8 51
|
442 |
+
.b8 100
|
443 |
+
.b8 52
|
444 |
+
.b8 100
|
445 |
+
.b8 101
|
446 |
+
.b8 0
|
447 |
+
.b8 1
|
448 |
+
.b8 18
|
449 |
+
.b8 1
|
450 |
+
.b8 0
|
451 |
+
}
|
452 |
+
.section .debug_pubnames
|
453 |
+
{
|
454 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
455 |
+
$L__pubNames_start0:
|
456 |
+
.b8 2
|
457 |
+
.b8 0
|
458 |
+
.b32 .debug_info
|
459 |
+
.b32 188
|
460 |
+
.b32 125
|
461 |
+
.b8 116
|
462 |
+
.b8 114
|
463 |
+
.b8 105
|
464 |
+
.b8 116
|
465 |
+
.b8 111
|
466 |
+
.b8 110
|
467 |
+
.b8 95
|
468 |
+
.b8 95
|
469 |
+
.b8 48
|
470 |
+
.b8 100
|
471 |
+
.b8 49
|
472 |
+
.b8 100
|
473 |
+
.b8 50
|
474 |
+
.b8 100
|
475 |
+
.b8 51
|
476 |
+
.b8 100
|
477 |
+
.b8 52
|
478 |
+
.b8 100
|
479 |
+
.b8 101
|
480 |
+
.b8 0
|
481 |
+
.b32 0
|
482 |
+
$L__pubNames_end0:
|
483 |
+
}
|
484 |
+
.section .debug_pubtypes
|
485 |
+
{
|
486 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
487 |
+
$L__pubTypes_start0:
|
488 |
+
.b8 2
|
489 |
+
.b8 0
|
490 |
+
.b32 .debug_info
|
491 |
+
.b32 188
|
492 |
+
.b32 0
|
493 |
+
$L__pubTypes_end0:
|
494 |
+
}
|
495 |
+
.section .debug_loc { }
|
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx
ADDED
@@ -0,0 +1,486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 256, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<10>;
|
20 |
+
.reg .b16 %rs<7>;
|
21 |
+
.reg .b32 %r<25>;
|
22 |
+
.reg .f32 %f<127>;
|
23 |
+
.reg .b64 %rd<8>;
|
24 |
+
.loc 1 18 0
|
25 |
+
$L__func_begin0:
|
26 |
+
.loc 1 18 0
|
27 |
+
|
28 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
30 |
+
$L__tmp0:
|
31 |
+
.loc 1 21 36
|
32 |
+
mov.u32 %r8, %tid.x;
|
33 |
+
shl.b32 %r9, %r8, 1;
|
34 |
+
and.b32 %r10, %r9, 510;
|
35 |
+
.loc 1 20 28
|
36 |
+
mov.u32 %r1, %ctaid.x;
|
37 |
+
.loc 1 20 33
|
38 |
+
shl.b32 %r11, %r1, 9;
|
39 |
+
.loc 1 21 23
|
40 |
+
or.b32 %r12, %r11, %r10;
|
41 |
+
.loc 1 24 34
|
42 |
+
mul.wide.s32 %rd6, %r12, 2;
|
43 |
+
add.s64 %rd7, %rd4, %rd6;
|
44 |
+
mov.pred %p1, -1;
|
45 |
+
.loc 1 24 39
|
46 |
+
mov.u32 %r2, 0x0;
|
47 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ];
|
48 |
+
.loc 1 25 30
|
49 |
+
add.s64 %rd3, %rd5, %rd6;
|
50 |
+
.loc 1 25 35
|
51 |
+
mov.u32 %r5, 0x0;
|
52 |
+
@%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ];
|
53 |
+
cvt.u16.u32 %rs3, %r5;
|
54 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
|
55 |
+
.loc 1 25 44
|
56 |
+
cvt.f32.bf16 %r6, %rs3;
|
57 |
+
mov.b32 %f3, %r6;
|
58 |
+
cvt.f32.bf16 %r7, %rs4;
|
59 |
+
mov.b32 %f4, %r7;
|
60 |
+
.loc 1 29 18
|
61 |
+
mul.f32 %f5, %f3, 0f3F3504F3;
|
62 |
+
.loc 1 30 23
|
63 |
+
abs.ftz.f32 %f7, %f5;
|
64 |
+
setp.ge.f32 %p3, %f7, 0f3F8060FE;
|
65 |
+
mov.f32 %f115, 0f3789CA3C;
|
66 |
+
mov.f32 %f114, 0fB9F560B9;
|
67 |
+
mov.f32 %f113, 0f3BAC840B;
|
68 |
+
mov.f32 %f112, 0fBD0C8162;
|
69 |
+
mov.f32 %f111, 0f3E1CF906;
|
70 |
+
mov.f32 %f110, 0f3F6A937E;
|
71 |
+
mov.f32 %f109, 0f3F20D842;
|
72 |
+
mov.f32 %f116, %f7;
|
73 |
+
@%p3 bra $L__BB0_2;
|
74 |
+
.loc 1 0 23
|
75 |
+
mov.f32 %f115, 0f38B1E96A;
|
76 |
+
mov.f32 %f114, 0fBA574D20;
|
77 |
+
mov.f32 %f113, 0f3BAAD5EA;
|
78 |
+
mov.f32 %f112, 0fBCDC1BE7;
|
79 |
+
mov.f32 %f111, 0f3DE718AF;
|
80 |
+
mov.f32 %f110, 0fBEC093AC;
|
81 |
+
mov.f32 %f109, 0f3E0375D3;
|
82 |
+
.loc 1 30 23
|
83 |
+
mul.f32 %f116, %f5, %f5;
|
84 |
+
$L__BB0_2:
|
85 |
+
.loc 1 0 0
|
86 |
+
cvt.u16.u32 %rs1, %r2;
|
87 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
88 |
+
mul.f32 %f6, %f4, 0f3F3504F3;
|
89 |
+
.loc 1 30 23
|
90 |
+
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
|
91 |
+
fma.rn.ftz.f32 %f47, %f115, %f116, %f114;
|
92 |
+
fma.rn.ftz.f32 %f48, %f47, %f116, %f113;
|
93 |
+
fma.rn.ftz.f32 %f49, %f48, %f116, %f112;
|
94 |
+
fma.rn.ftz.f32 %f50, %f49, %f116, %f111;
|
95 |
+
fma.rn.ftz.f32 %f51, %f50, %f116, %f110;
|
96 |
+
fma.rn.ftz.f32 %f52, %f51, %f116, %f109;
|
97 |
+
neg.f32 %f53, %f116;
|
98 |
+
selp.f32 %f54, %f53, %f5, %p3;
|
99 |
+
fma.rn.ftz.f32 %f117, %f52, %f54, %f54;
|
100 |
+
mov.f32 %f108, 0f3F800000;
|
101 |
+
@%p4 bra $L__BB0_4;
|
102 |
+
ex2.approx.ftz.f32 %f55, %f117;
|
103 |
+
sub.f32 %f57, %f108, %f55;
|
104 |
+
mov.b32 %r13, %f57;
|
105 |
+
mov.b32 %r14, %f5;
|
106 |
+
and.b32 %r15, %r14, -2147483648;
|
107 |
+
or.b32 %r16, %r15, %r13;
|
108 |
+
mov.b32 %f117, %r16;
|
109 |
+
$L__BB0_4:
|
110 |
+
.loc 1 0 0
|
111 |
+
cvt.f32.bf16 %r3, %rs1;
|
112 |
+
cvt.f32.bf16 %r4, %rs2;
|
113 |
+
.loc 1 30 23
|
114 |
+
abs.ftz.f32 %f20, %f6;
|
115 |
+
setp.ge.f32 %p6, %f20, 0f3F8060FE;
|
116 |
+
mov.f32 %f124, 0f3789CA3C;
|
117 |
+
mov.f32 %f123, 0fB9F560B9;
|
118 |
+
mov.f32 %f122, 0f3BAC840B;
|
119 |
+
mov.f32 %f121, 0fBD0C8162;
|
120 |
+
mov.f32 %f120, 0f3E1CF906;
|
121 |
+
mov.f32 %f119, 0f3F6A937E;
|
122 |
+
mov.f32 %f118, 0f3F20D842;
|
123 |
+
mov.f32 %f125, %f20;
|
124 |
+
@%p6 bra $L__BB0_6;
|
125 |
+
mul.f32 %f125, %f6, %f6;
|
126 |
+
mov.f32 %f124, 0f38B1E96A;
|
127 |
+
mov.f32 %f123, 0fBA574D20;
|
128 |
+
mov.f32 %f122, 0f3BAAD5EA;
|
129 |
+
mov.f32 %f121, 0fBCDC1BE7;
|
130 |
+
mov.f32 %f120, 0f3DE718AF;
|
131 |
+
mov.f32 %f119, 0fBEC093AC;
|
132 |
+
mov.f32 %f118, 0f3E0375D3;
|
133 |
+
$L__BB0_6:
|
134 |
+
.loc 1 0 0
|
135 |
+
mov.b32 %f1, %r3;
|
136 |
+
mov.b32 %f2, %r4;
|
137 |
+
.loc 1 30 23
|
138 |
+
setp.ltu.f32 %p7, %f20, 0f3F8060FE;
|
139 |
+
fma.rn.ftz.f32 %f72, %f124, %f125, %f123;
|
140 |
+
fma.rn.ftz.f32 %f73, %f72, %f125, %f122;
|
141 |
+
fma.rn.ftz.f32 %f74, %f73, %f125, %f121;
|
142 |
+
fma.rn.ftz.f32 %f75, %f74, %f125, %f120;
|
143 |
+
fma.rn.ftz.f32 %f76, %f75, %f125, %f119;
|
144 |
+
fma.rn.ftz.f32 %f77, %f76, %f125, %f118;
|
145 |
+
neg.f32 %f78, %f125;
|
146 |
+
selp.f32 %f79, %f78, %f6, %p6;
|
147 |
+
fma.rn.ftz.f32 %f126, %f77, %f79, %f79;
|
148 |
+
@%p7 bra $L__BB0_8;
|
149 |
+
ex2.approx.ftz.f32 %f80, %f126;
|
150 |
+
sub.f32 %f82, %f108, %f80;
|
151 |
+
mov.b32 %r17, %f82;
|
152 |
+
mov.b32 %r18, %f6;
|
153 |
+
and.b32 %r19, %r18, -2147483648;
|
154 |
+
or.b32 %r20, %r19, %r17;
|
155 |
+
mov.b32 %f126, %r20;
|
156 |
+
$L__BB0_8:
|
157 |
+
.loc 1 32 18
|
158 |
+
add.f32 %f87, %f117, 0f3F800000;
|
159 |
+
add.f32 %f88, %f126, 0f3F800000;
|
160 |
+
.loc 1 35 19
|
161 |
+
mul.f32 %f89, %f3, %f3;
|
162 |
+
mul.f32 %f90, %f4, %f4;
|
163 |
+
.loc 1 37 20
|
164 |
+
mul.f32 %f91, %f89, 0fBF000000;
|
165 |
+
mul.f32 %f92, %f90, 0fBF000000;
|
166 |
+
.loc 1 38 19
|
167 |
+
mul.f32 %f84, %f91, 0f3FB8AA3B;
|
168 |
+
ex2.approx.f32 %f83, %f84;
|
169 |
+
mul.f32 %f86, %f92, 0f3FB8AA3B;
|
170 |
+
ex2.approx.f32 %f85, %f86;
|
171 |
+
.loc 1 40 20
|
172 |
+
mul.f32 %f93, %f83, 0f3ECC422A;
|
173 |
+
mul.f32 %f94, %f85, 0f3ECC422A;
|
174 |
+
.loc 1 41 19
|
175 |
+
mul.f32 %f95, %f3, %f93;
|
176 |
+
mul.f32 %f96, %f4, %f94;
|
177 |
+
.loc 1 42 20
|
178 |
+
fma.rn.f32 %f97, %f87, 0f3F000000, %f95;
|
179 |
+
fma.rn.f32 %f98, %f88, 0f3F000000, %f96;
|
180 |
+
.loc 1 43 19
|
181 |
+
mul.f32 %f99, %f1, %f97;
|
182 |
+
mul.f32 %f100, %f2, %f98;
|
183 |
+
.loc 1 45 40
|
184 |
+
mov.b32 %r21, %f99;
|
185 |
+
cvt.rn.bf16.f32 %rs5, %r21;
|
186 |
+
mov.b32 %r22, %f100;
|
187 |
+
cvt.rn.bf16.f32 %rs6, %r22;
|
188 |
+
mov.b32 %r24, {%rs5, %rs6};
|
189 |
+
@%p1 st.global.b32 [ %rd7 + 0 ], { %r24 };
|
190 |
+
.loc 1 45 4
|
191 |
+
ret;
|
192 |
+
$L__tmp1:
|
193 |
+
$L__func_end0:
|
194 |
+
|
195 |
+
}
|
196 |
+
// .globl __nv_erff
|
197 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
198 |
+
.param .b32 __nv_erff_param_0
|
199 |
+
)
|
200 |
+
{
|
201 |
+
.reg .pred %p<4>;
|
202 |
+
.reg .b32 %r<5>;
|
203 |
+
.reg .f32 %f<49>;
|
204 |
+
$L__func_begin1:
|
205 |
+
|
206 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
207 |
+
abs.ftz.f32 %f1, %f14;
|
208 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
209 |
+
mov.f32 %f46, 0f3789CA3C;
|
210 |
+
mov.f32 %f45, 0fB9F560B9;
|
211 |
+
mov.f32 %f44, 0f3BAC840B;
|
212 |
+
mov.f32 %f43, 0fBD0C8162;
|
213 |
+
mov.f32 %f42, 0f3E1CF906;
|
214 |
+
mov.f32 %f41, 0f3F6A937E;
|
215 |
+
mov.f32 %f40, 0f3F20D842;
|
216 |
+
mov.f32 %f47, %f1;
|
217 |
+
@%p1 bra $L__BB1_2;
|
218 |
+
mul.f32 %f47, %f14, %f14;
|
219 |
+
mov.f32 %f46, 0f38B1E96A;
|
220 |
+
mov.f32 %f45, 0fBA574D20;
|
221 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
222 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
223 |
+
mov.f32 %f42, 0f3DE718AF;
|
224 |
+
mov.f32 %f41, 0fBEC093AC;
|
225 |
+
mov.f32 %f40, 0f3E0375D3;
|
226 |
+
$L__BB1_2:
|
227 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
228 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
229 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
230 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
231 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
232 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
233 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
234 |
+
neg.f32 %f35, %f47;
|
235 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
236 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
237 |
+
@%p2 bra $L__BB1_4;
|
238 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
239 |
+
mov.f32 %f38, 0f3F800000;
|
240 |
+
sub.f32 %f39, %f38, %f37;
|
241 |
+
mov.b32 %r1, %f39;
|
242 |
+
mov.b32 %r2, %f14;
|
243 |
+
and.b32 %r3, %r2, -2147483648;
|
244 |
+
or.b32 %r4, %r3, %r1;
|
245 |
+
mov.b32 %f48, %r4;
|
246 |
+
$L__BB1_4:
|
247 |
+
st.param.f32 [func_retval0+0], %f48;
|
248 |
+
ret;
|
249 |
+
$L__func_end1:
|
250 |
+
|
251 |
+
}
|
252 |
+
.file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
|
253 |
+
.section .debug_abbrev
|
254 |
+
{
|
255 |
+
.b8 1
|
256 |
+
.b8 17
|
257 |
+
.b8 1
|
258 |
+
.b8 37
|
259 |
+
.b8 8
|
260 |
+
.b8 19
|
261 |
+
.b8 5
|
262 |
+
.b8 3
|
263 |
+
.b8 8
|
264 |
+
.b8 16
|
265 |
+
.b8 6
|
266 |
+
.b8 27
|
267 |
+
.b8 8
|
268 |
+
.b8 180
|
269 |
+
.b8 66
|
270 |
+
.b8 12
|
271 |
+
.b8 17
|
272 |
+
.b8 1
|
273 |
+
.b8 18
|
274 |
+
.b8 1
|
275 |
+
.b8 0
|
276 |
+
.b8 0
|
277 |
+
.b8 2
|
278 |
+
.b8 46
|
279 |
+
.b8 0
|
280 |
+
.b8 17
|
281 |
+
.b8 1
|
282 |
+
.b8 18
|
283 |
+
.b8 1
|
284 |
+
.b8 64
|
285 |
+
.b8 10
|
286 |
+
.b8 135
|
287 |
+
.b8 64
|
288 |
+
.b8 8
|
289 |
+
.b8 3
|
290 |
+
.b8 8
|
291 |
+
.b8 58
|
292 |
+
.b8 11
|
293 |
+
.b8 59
|
294 |
+
.b8 11
|
295 |
+
.b8 63
|
296 |
+
.b8 12
|
297 |
+
.b8 0
|
298 |
+
.b8 0
|
299 |
+
.b8 0
|
300 |
+
}
|
301 |
+
.section .debug_info
|
302 |
+
{
|
303 |
+
.b32 176
|
304 |
+
.b8 2
|
305 |
+
.b8 0
|
306 |
+
.b32 .debug_abbrev
|
307 |
+
.b8 8
|
308 |
+
.b8 1
|
309 |
+
.b8 116
|
310 |
+
.b8 114
|
311 |
+
.b8 105
|
312 |
+
.b8 116
|
313 |
+
.b8 111
|
314 |
+
.b8 110
|
315 |
+
.b8 0
|
316 |
+
.b8 2
|
317 |
+
.b8 0
|
318 |
+
.b8 99
|
319 |
+
.b8 53
|
320 |
+
.b8 106
|
321 |
+
.b8 120
|
322 |
+
.b8 97
|
323 |
+
.b8 103
|
324 |
+
.b8 117
|
325 |
+
.b8 120
|
326 |
+
.b8 104
|
327 |
+
.b8 111
|
328 |
+
.b8 51
|
329 |
+
.b8 110
|
330 |
+
.b8 104
|
331 |
+
.b8 114
|
332 |
+
.b8 108
|
333 |
+
.b8 116
|
334 |
+
.b8 53
|
335 |
+
.b8 118
|
336 |
+
.b8 99
|
337 |
+
.b8 105
|
338 |
+
.b8 110
|
339 |
+
.b8 110
|
340 |
+
.b8 122
|
341 |
+
.b8 53
|
342 |
+
.b8 102
|
343 |
+
.b8 101
|
344 |
+
.b8 118
|
345 |
+
.b8 111
|
346 |
+
.b8 100
|
347 |
+
.b8 117
|
348 |
+
.b8 109
|
349 |
+
.b8 108
|
350 |
+
.b8 112
|
351 |
+
.b8 119
|
352 |
+
.b8 110
|
353 |
+
.b8 52
|
354 |
+
.b8 119
|
355 |
+
.b8 121
|
356 |
+
.b8 98
|
357 |
+
.b8 50
|
358 |
+
.b8 118
|
359 |
+
.b8 120
|
360 |
+
.b8 51
|
361 |
+
.b8 120
|
362 |
+
.b8 114
|
363 |
+
.b8 118
|
364 |
+
.b8 101
|
365 |
+
.b8 105
|
366 |
+
.b8 99
|
367 |
+
.b8 101
|
368 |
+
.b8 114
|
369 |
+
.b8 108
|
370 |
+
.b8 46
|
371 |
+
.b8 112
|
372 |
+
.b8 121
|
373 |
+
.b8 0
|
374 |
+
.b32 .debug_line
|
375 |
+
.b8 47
|
376 |
+
.b8 116
|
377 |
+
.b8 109
|
378 |
+
.b8 112
|
379 |
+
.b8 47
|
380 |
+
.b8 116
|
381 |
+
.b8 111
|
382 |
+
.b8 114
|
383 |
+
.b8 99
|
384 |
+
.b8 104
|
385 |
+
.b8 105
|
386 |
+
.b8 110
|
387 |
+
.b8 100
|
388 |
+
.b8 117
|
389 |
+
.b8 99
|
390 |
+
.b8 116
|
391 |
+
.b8 111
|
392 |
+
.b8 114
|
393 |
+
.b8 95
|
394 |
+
.b8 114
|
395 |
+
.b8 111
|
396 |
+
.b8 111
|
397 |
+
.b8 116
|
398 |
+
.b8 47
|
399 |
+
.b8 53
|
400 |
+
.b8 106
|
401 |
+
.b8 0
|
402 |
+
.b8 1
|
403 |
+
.b64 $L__func_begin0
|
404 |
+
.b64 $L__func_end0
|
405 |
+
.b8 2
|
406 |
+
.b64 $L__func_begin0
|
407 |
+
.b64 $L__func_end0
|
408 |
+
.b8 1
|
409 |
+
.b8 156
|
410 |
+
.b8 116
|
411 |
+
.b8 114
|
412 |
+
.b8 105
|
413 |
+
.b8 116
|
414 |
+
.b8 111
|
415 |
+
.b8 110
|
416 |
+
.b8 95
|
417 |
+
.b8 95
|
418 |
+
.b8 48
|
419 |
+
.b8 100
|
420 |
+
.b8 49
|
421 |
+
.b8 100
|
422 |
+
.b8 50
|
423 |
+
.b8 100
|
424 |
+
.b8 101
|
425 |
+
.b8 0
|
426 |
+
.b8 116
|
427 |
+
.b8 114
|
428 |
+
.b8 105
|
429 |
+
.b8 116
|
430 |
+
.b8 111
|
431 |
+
.b8 110
|
432 |
+
.b8 95
|
433 |
+
.b8 95
|
434 |
+
.b8 48
|
435 |
+
.b8 100
|
436 |
+
.b8 49
|
437 |
+
.b8 100
|
438 |
+
.b8 50
|
439 |
+
.b8 100
|
440 |
+
.b8 101
|
441 |
+
.b8 0
|
442 |
+
.b8 1
|
443 |
+
.b8 18
|
444 |
+
.b8 1
|
445 |
+
.b8 0
|
446 |
+
}
|
447 |
+
.section .debug_pubnames
|
448 |
+
{
|
449 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
450 |
+
$L__pubNames_start0:
|
451 |
+
.b8 2
|
452 |
+
.b8 0
|
453 |
+
.b32 .debug_info
|
454 |
+
.b32 180
|
455 |
+
.b32 125
|
456 |
+
.b8 116
|
457 |
+
.b8 114
|
458 |
+
.b8 105
|
459 |
+
.b8 116
|
460 |
+
.b8 111
|
461 |
+
.b8 110
|
462 |
+
.b8 95
|
463 |
+
.b8 95
|
464 |
+
.b8 48
|
465 |
+
.b8 100
|
466 |
+
.b8 49
|
467 |
+
.b8 100
|
468 |
+
.b8 50
|
469 |
+
.b8 100
|
470 |
+
.b8 101
|
471 |
+
.b8 0
|
472 |
+
.b32 0
|
473 |
+
$L__pubNames_end0:
|
474 |
+
}
|
475 |
+
.section .debug_pubtypes
|
476 |
+
{
|
477 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
478 |
+
$L__pubTypes_start0:
|
479 |
+
.b8 2
|
480 |
+
.b8 0
|
481 |
+
.b32 .debug_info
|
482 |
+
.b32 180
|
483 |
+
.b32 0
|
484 |
+
$L__pubTypes_end0:
|
485 |
+
}
|
486 |
+
.section .debug_loc { }
|
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.398942292> : tensor<512xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
|
8 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked>
|
9 |
+
%c512_i32 = arith.constant 512 : i32
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
12 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
13 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
14 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
15 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
16 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
17 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
|
18 |
+
%8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
20 |
+
%10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
21 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
|
22 |
+
%12 = arith.extf %11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
|
23 |
+
%13 = arith.mulf %12, %cst_3 : tensor<512xf32, #blocked>
|
24 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked>
|
25 |
+
%15 = arith.addf %14, %cst_2 : tensor<512xf32, #blocked>
|
26 |
+
%16 = arith.mulf %15, %cst_1 : tensor<512xf32, #blocked>
|
27 |
+
%17 = arith.mulf %12, %12 : tensor<512xf32, #blocked>
|
28 |
+
%18 = arith.mulf %17, %cst_0 : tensor<512xf32, #blocked>
|
29 |
+
%19 = math.exp %18 : tensor<512xf32, #blocked>
|
30 |
+
%20 = arith.mulf %19, %cst : tensor<512xf32, #blocked>
|
31 |
+
%21 = arith.mulf %12, %20 : tensor<512xf32, #blocked>
|
32 |
+
%22 = arith.addf %16, %21 : tensor<512xf32, #blocked>
|
33 |
+
%23 = arith.mulf %8, %22 : tensor<512xf32, #blocked>
|
34 |
+
%24 = arith.truncf %23 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
|
35 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<16x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<16x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<16x1xi64>
|
6 |
+
%cst_2 = arith.constant dense<true> : tensor<16x1xi1>
|
7 |
+
%cst_3 = arith.constant dense<256> : tensor<16x1xi32>
|
8 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
|
9 |
+
%cst_5 = arith.constant dense<120> : tensor<1x128xi32>
|
10 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
|
11 |
+
%c16_i32 = arith.constant 16 : i32
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
14 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
|
16 |
+
%4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
|
17 |
+
%5 = arith.addi %4, %3 : tensor<16x1xi32>
|
18 |
+
%6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
19 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
|
20 |
+
%8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
|
21 |
+
%9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
|
22 |
+
%10 = tt.broadcast %5 : (tensor<16x1xi32>) -> tensor<16x128xi32>
|
23 |
+
%11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<16x128xi32>
|
24 |
+
%12 = arith.addi %10, %11 : tensor<16x128xi32>
|
25 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
|
26 |
+
%14 = tt.addptr %13, %12 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
|
27 |
+
%15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<16x128xi1>
|
28 |
+
%16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
|
29 |
+
%17 = arith.addf %16, %cst_6 : tensor<16x128xf32>
|
30 |
+
%18 = arith.select %15, %17, %cst_6 : tensor<16x128xi1>, tensor<16x128xf32>
|
31 |
+
%19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
|
32 |
+
^bb0(%arg5: f32, %arg6: f32):
|
33 |
+
%35 = arith.addf %arg5, %arg6 : f32
|
34 |
+
tt.reduce.return %35 : f32
|
35 |
+
}) : (tensor<16x128xf32>) -> tensor<16xf32>
|
36 |
+
%20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
37 |
+
%21 = arith.divsi %5, %cst_3 : tensor<16x1xi32>
|
38 |
+
%22 = arith.remsi %5, %cst_3 : tensor<16x1xi32>
|
39 |
+
%23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
|
40 |
+
%24 = tt.addptr %23, %21 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
|
41 |
+
%25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
|
42 |
+
%26 = arith.addi %25, %cst_1 : tensor<16x1xi64>
|
43 |
+
%27 = arith.cmpi slt, %25, %cst_0 : tensor<16x1xi64>
|
44 |
+
%28 = arith.select %27, %26, %25 : tensor<16x1xi1>, tensor<16x1xi64>
|
45 |
+
%29 = arith.muli %28, %cst : tensor<16x1xi64>
|
46 |
+
%30 = arith.extsi %22 : tensor<16x1xi32> to tensor<16x1xi64>
|
47 |
+
%31 = arith.addi %30, %29 : tensor<16x1xi64>
|
48 |
+
%32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x1x!tt.ptr<f32, 1>>
|
49 |
+
%33 = tt.addptr %32, %31 : tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xi64>
|
50 |
+
%34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xf32>, tensor<16x1xi1>) -> tensor<16x1xf32>
|
51 |
+
tt.return
|
52 |
+
}
|
53 |
+
}
|
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin
ADDED
Binary file (13.9 kB). View file
|
|
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx
ADDED
@@ -0,0 +1,709 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
19 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
21 |
+
)
|
22 |
+
.maxntid 64, 1, 1
|
23 |
+
{
|
24 |
+
.reg .pred %p<33>;
|
25 |
+
.reg .b16 %rs<9>;
|
26 |
+
.reg .b32 %r<106>;
|
27 |
+
.reg .f32 %f<73>;
|
28 |
+
.reg .b64 %rd<21>;
|
29 |
+
.loc 1 18 0
|
30 |
+
$L__func_begin0:
|
31 |
+
.loc 1 18 0
|
32 |
+
|
33 |
+
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_0];
|
34 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_1];
|
35 |
+
$L__tmp0:
|
36 |
+
.loc 1 26 26
|
37 |
+
mov.u32 %r72, %tid.x;
|
38 |
+
and.b32 %r73, %r72, 31;
|
39 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_2];
|
40 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3];
|
41 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4];
|
42 |
+
shl.b32 %r74, %r72, 2;
|
43 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5];
|
44 |
+
and.b32 %r75, %r74, 252;
|
45 |
+
.loc 1 23 28
|
46 |
+
mov.u32 %r1, %ctaid.x;
|
47 |
+
.loc 1 30 40
|
48 |
+
shl.b32 %r76, %r1, 8;
|
49 |
+
.loc 1 30 36
|
50 |
+
or.b32 %r77, %r76, %r75;
|
51 |
+
.loc 1 30 30
|
52 |
+
mul.wide.s32 %rd17, %r77, 2;
|
53 |
+
add.s64 %rd1, %rd12, %rd17;
|
54 |
+
mov.b32 %r4, 0;
|
55 |
+
mov.pred %p1, -1;
|
56 |
+
.loc 1 30 46
|
57 |
+
mov.u32 %r2, 0x0;
|
58 |
+
mov.u32 %r3, 0x0;
|
59 |
+
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
|
60 |
+
@!%p1 mov.u32 %r2, %r4;
|
61 |
+
@!%p1 mov.u32 %r3, %r4;
|
62 |
+
cvt.u16.u32 %rs1, %r2;
|
63 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
64 |
+
cvt.u16.u32 %rs3, %r3;
|
65 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
66 |
+
.loc 1 30 67
|
67 |
+
cvt.f32.bf16 %r6, %rs1;
|
68 |
+
mov.b32 %f1, %r6;
|
69 |
+
cvt.f32.bf16 %r7, %rs2;
|
70 |
+
mov.b32 %f2, %r7;
|
71 |
+
cvt.f32.bf16 %r8, %rs3;
|
72 |
+
mov.b32 %f3, %r8;
|
73 |
+
cvt.f32.bf16 %r9, %rs4;
|
74 |
+
mov.b32 %f4, %r9;
|
75 |
+
.loc 1 31 30
|
76 |
+
mul.wide.u32 %rd18, %r75, 4;
|
77 |
+
add.s64 %rd2, %rd13, %rd18;
|
78 |
+
.loc 1 31 35
|
79 |
+
mov.u32 %r10, 0x0;
|
80 |
+
mov.u32 %r11, 0x0;
|
81 |
+
mov.u32 %r12, 0x0;
|
82 |
+
mov.u32 %r13, 0x0;
|
83 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
84 |
+
@!%p1 mov.u32 %r10, %r4;
|
85 |
+
@!%p1 mov.u32 %r11, %r4;
|
86 |
+
@!%p1 mov.u32 %r12, %r4;
|
87 |
+
@!%p1 mov.u32 %r13, %r4;
|
88 |
+
mov.b32 %f5, %r10;
|
89 |
+
mov.b32 %f6, %r11;
|
90 |
+
mov.b32 %f7, %r12;
|
91 |
+
mov.b32 %f8, %r13;
|
92 |
+
.loc 1 32 30
|
93 |
+
mul.wide.s32 %rd19, %r77, 4;
|
94 |
+
add.s64 %rd3, %rd14, %rd19;
|
95 |
+
.loc 1 32 46
|
96 |
+
mov.u32 %r18, 0x0;
|
97 |
+
mov.u32 %r19, 0x0;
|
98 |
+
mov.u32 %r20, 0x0;
|
99 |
+
mov.u32 %r21, 0x0;
|
100 |
+
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
101 |
+
@!%p1 mov.u32 %r18, %r4;
|
102 |
+
@!%p1 mov.u32 %r19, %r4;
|
103 |
+
@!%p1 mov.u32 %r20, %r4;
|
104 |
+
@!%p1 mov.u32 %r21, %r4;
|
105 |
+
mov.b32 %f9, %r18;
|
106 |
+
mov.b32 %f10, %r19;
|
107 |
+
mov.b32 %f11, %r20;
|
108 |
+
mov.b32 %f12, %r21;
|
109 |
+
.loc 1 33 35
|
110 |
+
add.s64 %rd4, %rd11, %rd19;
|
111 |
+
.loc 1 33 51
|
112 |
+
mov.u32 %r26, 0x0;
|
113 |
+
mov.u32 %r27, 0x0;
|
114 |
+
mov.u32 %r28, 0x0;
|
115 |
+
mov.u32 %r29, 0x0;
|
116 |
+
@%p1 ld.global.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
|
117 |
+
@!%p1 mov.u32 %r26, %r4;
|
118 |
+
@!%p1 mov.u32 %r27, %r4;
|
119 |
+
@!%p1 mov.u32 %r28, %r4;
|
120 |
+
@!%p1 mov.u32 %r29, %r4;
|
121 |
+
mov.b32 %f13, %r26;
|
122 |
+
mov.b32 %f14, %r27;
|
123 |
+
mov.b32 %f15, %r28;
|
124 |
+
mov.b32 %f16, %r29;
|
125 |
+
.loc 1 34 31
|
126 |
+
mul.wide.s32 %rd20, %r1, 4;
|
127 |
+
add.s64 %rd5, %rd15, %rd20;
|
128 |
+
.loc 1 34 36
|
129 |
+
mov.u32 %r51, 0x0;
|
130 |
+
@%p1 ld.global.L1::evict_last.b32 { %r51 }, [ %rd5 + 0 ];
|
131 |
+
mov.u32 %r35, 0x0;
|
132 |
+
@%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ];
|
133 |
+
mov.u32 %r36, 0x0;
|
134 |
+
@%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ];
|
135 |
+
mov.u32 %r37, 0x0;
|
136 |
+
@%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ];
|
137 |
+
.loc 1 36 18
|
138 |
+
mul.f32 %f17, %f1, %f5;
|
139 |
+
mul.f32 %f18, %f2, %f6;
|
140 |
+
mul.f32 %f19, %f3, %f7;
|
141 |
+
mul.f32 %f20, %f4, %f8;
|
142 |
+
$L__tmp1:
|
143 |
+
.loc 2 233 15
|
144 |
+
fma.rn.f32 %f21, %f1, %f5, %f18;
|
145 |
+
fma.rn.f32 %f22, %f3, %f7, %f21;
|
146 |
+
fma.rn.f32 %f23, %f4, %f8, %f22;
|
147 |
+
$L__tmp2:
|
148 |
+
.loc 2 243 36
|
149 |
+
mov.b32 %r78, %f23;
|
150 |
+
shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
|
151 |
+
mov.b32 %f24, %r79;
|
152 |
+
$L__tmp3:
|
153 |
+
.loc 2 233 15
|
154 |
+
add.f32 %f25, %f23, %f24;
|
155 |
+
$L__tmp4:
|
156 |
+
.loc 2 243 36
|
157 |
+
mov.b32 %r80, %f25;
|
158 |
+
shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
|
159 |
+
mov.b32 %f26, %r81;
|
160 |
+
$L__tmp5:
|
161 |
+
.loc 2 233 15
|
162 |
+
add.f32 %f27, %f25, %f26;
|
163 |
+
$L__tmp6:
|
164 |
+
.loc 2 243 36
|
165 |
+
mov.b32 %r82, %f27;
|
166 |
+
shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1;
|
167 |
+
mov.b32 %f28, %r83;
|
168 |
+
$L__tmp7:
|
169 |
+
.loc 2 233 15
|
170 |
+
add.f32 %f29, %f27, %f28;
|
171 |
+
$L__tmp8:
|
172 |
+
.loc 2 243 36
|
173 |
+
mov.b32 %r84, %f29;
|
174 |
+
shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1;
|
175 |
+
mov.b32 %f30, %r85;
|
176 |
+
$L__tmp9:
|
177 |
+
.loc 2 233 15
|
178 |
+
add.f32 %f31, %f29, %f30;
|
179 |
+
$L__tmp10:
|
180 |
+
.loc 2 243 36
|
181 |
+
mov.b32 %r86, %f31;
|
182 |
+
shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
|
183 |
+
mov.b32 %f32, %r87;
|
184 |
+
$L__tmp11:
|
185 |
+
.loc 2 233 15
|
186 |
+
add.f32 %f33, %f31, %f32;
|
187 |
+
$L__tmp12:
|
188 |
+
.loc 2 243 36
|
189 |
+
setp.eq.s32 %p23, %r73, 0;
|
190 |
+
shr.u32 %r88, %r72, 3;
|
191 |
+
and.b32 %r89, %r88, 4;
|
192 |
+
mov.u32 %r90, global_smem;
|
193 |
+
add.s32 %r38, %r90, %r89;
|
194 |
+
mov.b32 %r39, %f33;
|
195 |
+
@%p23 st.shared.b32 [ %r38 + 0 ], %r39;
|
196 |
+
bar.sync 0;
|
197 |
+
setp.lt.s32 %p24, %r72, 2;
|
198 |
+
add.s32 %r41, %r90, %r74;
|
199 |
+
@%p24 ld.shared.b32 %r40, [ %r41 + 0 ];
|
200 |
+
mov.b32 %f34, %r40;
|
201 |
+
shfl.sync.bfly.b32 %r91, %r40, 1, 31, -1;
|
202 |
+
mov.b32 %f35, %r91;
|
203 |
+
$L__tmp13:
|
204 |
+
.loc 2 233 15
|
205 |
+
add.f32 %f36, %f34, %f35;
|
206 |
+
$L__tmp14:
|
207 |
+
.loc 2 243 36
|
208 |
+
and.b32 %r92, %r72, 1;
|
209 |
+
setp.eq.b32 %p31, %r92, 1;
|
210 |
+
not.pred %p32, %p31;
|
211 |
+
and.pred %p25, %p24, %p32;
|
212 |
+
mov.b32 %r43, %f36;
|
213 |
+
@%p25 st.shared.b32 [ %r41 + 0 ], %r43;
|
214 |
+
bar.sync 0;
|
215 |
+
ld.shared.f32 %f37, [global_smem];
|
216 |
+
$L__tmp15:
|
217 |
+
.loc 3 8 15
|
218 |
+
add.f32 %f38, %f37, 0f00000000;
|
219 |
+
$L__tmp16:
|
220 |
+
.loc 1 40 18
|
221 |
+
mul.f32 %f39, %f18, %f10;
|
222 |
+
$L__tmp17:
|
223 |
+
.loc 2 243 36
|
224 |
+
bar.sync 0;
|
225 |
+
$L__tmp18:
|
226 |
+
.loc 2 233 15
|
227 |
+
fma.rn.f32 %f40, %f17, %f9, %f39;
|
228 |
+
fma.rn.f32 %f41, %f19, %f11, %f40;
|
229 |
+
fma.rn.f32 %f42, %f20, %f12, %f41;
|
230 |
+
$L__tmp19:
|
231 |
+
.loc 2 243 36
|
232 |
+
mov.b32 %r93, %f42;
|
233 |
+
shfl.sync.bfly.b32 %r94, %r93, 16, 31, -1;
|
234 |
+
mov.b32 %f43, %r94;
|
235 |
+
$L__tmp20:
|
236 |
+
.loc 2 233 15
|
237 |
+
add.f32 %f44, %f42, %f43;
|
238 |
+
$L__tmp21:
|
239 |
+
.loc 2 243 36
|
240 |
+
mov.b32 %r95, %f44;
|
241 |
+
shfl.sync.bfly.b32 %r96, %r95, 8, 31, -1;
|
242 |
+
mov.b32 %f45, %r96;
|
243 |
+
$L__tmp22:
|
244 |
+
.loc 2 233 15
|
245 |
+
add.f32 %f46, %f44, %f45;
|
246 |
+
$L__tmp23:
|
247 |
+
.loc 2 243 36
|
248 |
+
mov.b32 %r97, %f46;
|
249 |
+
shfl.sync.bfly.b32 %r98, %r97, 4, 31, -1;
|
250 |
+
mov.b32 %f47, %r98;
|
251 |
+
$L__tmp24:
|
252 |
+
.loc 2 233 15
|
253 |
+
add.f32 %f48, %f46, %f47;
|
254 |
+
$L__tmp25:
|
255 |
+
.loc 2 243 36
|
256 |
+
mov.b32 %r99, %f48;
|
257 |
+
shfl.sync.bfly.b32 %r100, %r99, 2, 31, -1;
|
258 |
+
mov.b32 %f49, %r100;
|
259 |
+
$L__tmp26:
|
260 |
+
.loc 2 233 15
|
261 |
+
add.f32 %f50, %f48, %f49;
|
262 |
+
$L__tmp27:
|
263 |
+
.loc 2 243 36
|
264 |
+
mov.b32 %r101, %f50;
|
265 |
+
shfl.sync.bfly.b32 %r102, %r101, 1, 31, -1;
|
266 |
+
mov.b32 %f51, %r102;
|
267 |
+
$L__tmp28:
|
268 |
+
.loc 2 233 15
|
269 |
+
add.f32 %f52, %f50, %f51;
|
270 |
+
$L__tmp29:
|
271 |
+
.loc 2 243 36
|
272 |
+
mov.b32 %r45, %f52;
|
273 |
+
@%p23 st.shared.b32 [ %r38 + 0 ], %r45;
|
274 |
+
bar.sync 0;
|
275 |
+
@%p24 ld.shared.b32 %r46, [ %r41 + 0 ];
|
276 |
+
mov.b32 %f53, %r46;
|
277 |
+
shfl.sync.bfly.b32 %r103, %r46, 1, 31, -1;
|
278 |
+
mov.b32 %f54, %r103;
|
279 |
+
$L__tmp30:
|
280 |
+
.loc 2 233 15
|
281 |
+
add.f32 %f55, %f53, %f54;
|
282 |
+
$L__tmp31:
|
283 |
+
.loc 2 243 36
|
284 |
+
mov.b32 %r49, %f55;
|
285 |
+
@%p25 st.shared.b32 [ %r41 + 0 ], %r49;
|
286 |
+
bar.sync 0;
|
287 |
+
ld.shared.f32 %f56, [global_smem];
|
288 |
+
$L__tmp32:
|
289 |
+
.loc 3 8 15
|
290 |
+
add.f32 %f57, %f56, 0f00000000;
|
291 |
+
mov.b32 %r52, 1132462080;
|
292 |
+
$L__tmp33:
|
293 |
+
.loc 1 45 20
|
294 |
+
div.full.f32 %r50, %r51, %r52;
|
295 |
+
mov.b32 %f58, %r50;
|
296 |
+
.loc 1 47 20
|
297 |
+
neg.f32 %f59, %f38;
|
298 |
+
fma.rn.f32 %f60, %f17, 0f43800000, %f59;
|
299 |
+
fma.rn.f32 %f61, %f18, 0f43800000, %f59;
|
300 |
+
fma.rn.f32 %f62, %f19, 0f43800000, %f59;
|
301 |
+
fma.rn.f32 %f63, %f20, 0f43800000, %f59;
|
302 |
+
.loc 1 49 20
|
303 |
+
neg.f32 %f64, %f57;
|
304 |
+
fma.rn.f32 %f65, %f64, %f9, %f60;
|
305 |
+
fma.rn.f32 %f66, %f64, %f10, %f61;
|
306 |
+
fma.rn.f32 %f67, %f64, %f11, %f62;
|
307 |
+
fma.rn.f32 %f68, %f64, %f12, %f63;
|
308 |
+
.loc 1 51 20
|
309 |
+
fma.rn.f32 %f69, %f58, %f65, %f13;
|
310 |
+
fma.rn.f32 %f70, %f58, %f66, %f14;
|
311 |
+
fma.rn.f32 %f71, %f58, %f67, %f15;
|
312 |
+
fma.rn.f32 %f72, %f58, %f68, %f16;
|
313 |
+
.loc 1 53 51
|
314 |
+
mov.b32 %r62, %f69;
|
315 |
+
mov.b32 %r63, %f70;
|
316 |
+
mov.b32 %r64, %f71;
|
317 |
+
mov.b32 %r65, %f72;
|
318 |
+
@%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r62, %r63, %r64, %r65 };
|
319 |
+
.loc 1 54 25
|
320 |
+
add.s64 %rd10, %rd16, %rd17;
|
321 |
+
.loc 1 54 48
|
322 |
+
cvt.rn.bf16.f32 %rs5, %r62;
|
323 |
+
cvt.rn.bf16.f32 %rs6, %r63;
|
324 |
+
cvt.rn.bf16.f32 %rs7, %r64;
|
325 |
+
cvt.rn.bf16.f32 %rs8, %r65;
|
326 |
+
mov.b32 %r104, {%rs5, %rs6};
|
327 |
+
mov.b32 %r105, {%rs7, %rs8};
|
328 |
+
@%p1 st.global.v2.b32 [ %rd10 + 0 ], { %r104, %r105 };
|
329 |
+
.loc 1 54 4
|
330 |
+
ret;
|
331 |
+
$L__tmp34:
|
332 |
+
$L__func_end0:
|
333 |
+
|
334 |
+
}
|
335 |
+
.file 1 "/tmp/torchinductor_root/rn/crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py"
|
336 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
337 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
338 |
+
.section .debug_abbrev
|
339 |
+
{
|
340 |
+
.b8 1
|
341 |
+
.b8 17
|
342 |
+
.b8 1
|
343 |
+
.b8 37
|
344 |
+
.b8 8
|
345 |
+
.b8 19
|
346 |
+
.b8 5
|
347 |
+
.b8 3
|
348 |
+
.b8 8
|
349 |
+
.b8 16
|
350 |
+
.b8 6
|
351 |
+
.b8 27
|
352 |
+
.b8 8
|
353 |
+
.b8 180
|
354 |
+
.b8 66
|
355 |
+
.b8 12
|
356 |
+
.b8 17
|
357 |
+
.b8 1
|
358 |
+
.b8 18
|
359 |
+
.b8 1
|
360 |
+
.b8 0
|
361 |
+
.b8 0
|
362 |
+
.b8 2
|
363 |
+
.b8 46
|
364 |
+
.b8 0
|
365 |
+
.b8 135
|
366 |
+
.b8 64
|
367 |
+
.b8 8
|
368 |
+
.b8 3
|
369 |
+
.b8 8
|
370 |
+
.b8 58
|
371 |
+
.b8 11
|
372 |
+
.b8 59
|
373 |
+
.b8 11
|
374 |
+
.b8 63
|
375 |
+
.b8 12
|
376 |
+
.b8 32
|
377 |
+
.b8 11
|
378 |
+
.b8 0
|
379 |
+
.b8 0
|
380 |
+
.b8 3
|
381 |
+
.b8 46
|
382 |
+
.b8 1
|
383 |
+
.b8 17
|
384 |
+
.b8 1
|
385 |
+
.b8 18
|
386 |
+
.b8 1
|
387 |
+
.b8 64
|
388 |
+
.b8 10
|
389 |
+
.b8 49
|
390 |
+
.b8 19
|
391 |
+
.b8 0
|
392 |
+
.b8 0
|
393 |
+
.b8 4
|
394 |
+
.b8 29
|
395 |
+
.b8 1
|
396 |
+
.b8 49
|
397 |
+
.b8 19
|
398 |
+
.b8 17
|
399 |
+
.b8 1
|
400 |
+
.b8 18
|
401 |
+
.b8 1
|
402 |
+
.b8 88
|
403 |
+
.b8 11
|
404 |
+
.b8 89
|
405 |
+
.b8 11
|
406 |
+
.b8 87
|
407 |
+
.b8 11
|
408 |
+
.b8 0
|
409 |
+
.b8 0
|
410 |
+
.b8 5
|
411 |
+
.b8 29
|
412 |
+
.b8 0
|
413 |
+
.b8 49
|
414 |
+
.b8 19
|
415 |
+
.b8 17
|
416 |
+
.b8 1
|
417 |
+
.b8 18
|
418 |
+
.b8 1
|
419 |
+
.b8 88
|
420 |
+
.b8 11
|
421 |
+
.b8 89
|
422 |
+
.b8 11
|
423 |
+
.b8 87
|
424 |
+
.b8 11
|
425 |
+
.b8 0
|
426 |
+
.b8 0
|
427 |
+
.b8 0
|
428 |
+
}
|
429 |
+
.section .debug_info
|
430 |
+
{
|
431 |
+
.b32 399
|
432 |
+
.b8 2
|
433 |
+
.b8 0
|
434 |
+
.b32 .debug_abbrev
|
435 |
+
.b8 8
|
436 |
+
.b8 1
|
437 |
+
.b8 116
|
438 |
+
.b8 114
|
439 |
+
.b8 105
|
440 |
+
.b8 116
|
441 |
+
.b8 111
|
442 |
+
.b8 110
|
443 |
+
.b8 0
|
444 |
+
.b8 2
|
445 |
+
.b8 0
|
446 |
+
.b8 99
|
447 |
+
.b8 114
|
448 |
+
.b8 110
|
449 |
+
.b8 121
|
450 |
+
.b8 110
|
451 |
+
.b8 98
|
452 |
+
.b8 109
|
453 |
+
.b8 115
|
454 |
+
.b8 100
|
455 |
+
.b8 50
|
456 |
+
.b8 121
|
457 |
+
.b8 101
|
458 |
+
.b8 108
|
459 |
+
.b8 108
|
460 |
+
.b8 50
|
461 |
+
.b8 108
|
462 |
+
.b8 112
|
463 |
+
.b8 106
|
464 |
+
.b8 121
|
465 |
+
.b8 109
|
466 |
+
.b8 98
|
467 |
+
.b8 52
|
468 |
+
.b8 54
|
469 |
+
.b8 114
|
470 |
+
.b8 116
|
471 |
+
.b8 116
|
472 |
+
.b8 102
|
473 |
+
.b8 97
|
474 |
+
.b8 101
|
475 |
+
.b8 97
|
476 |
+
.b8 50
|
477 |
+
.b8 120
|
478 |
+
.b8 106
|
479 |
+
.b8 119
|
480 |
+
.b8 115
|
481 |
+
.b8 98
|
482 |
+
.b8 120
|
483 |
+
.b8 114
|
484 |
+
.b8 55
|
485 |
+
.b8 53
|
486 |
+
.b8 106
|
487 |
+
.b8 53
|
488 |
+
.b8 52
|
489 |
+
.b8 103
|
490 |
+
.b8 99
|
491 |
+
.b8 116
|
492 |
+
.b8 102
|
493 |
+
.b8 103
|
494 |
+
.b8 105
|
495 |
+
.b8 52
|
496 |
+
.b8 53
|
497 |
+
.b8 55
|
498 |
+
.b8 46
|
499 |
+
.b8 112
|
500 |
+
.b8 121
|
501 |
+
.b8 0
|
502 |
+
.b32 .debug_line
|
503 |
+
.b8 47
|
504 |
+
.b8 116
|
505 |
+
.b8 109
|
506 |
+
.b8 112
|
507 |
+
.b8 47
|
508 |
+
.b8 116
|
509 |
+
.b8 111
|
510 |
+
.b8 114
|
511 |
+
.b8 99
|
512 |
+
.b8 104
|
513 |
+
.b8 105
|
514 |
+
.b8 110
|
515 |
+
.b8 100
|
516 |
+
.b8 117
|
517 |
+
.b8 99
|
518 |
+
.b8 116
|
519 |
+
.b8 111
|
520 |
+
.b8 114
|
521 |
+
.b8 95
|
522 |
+
.b8 114
|
523 |
+
.b8 111
|
524 |
+
.b8 111
|
525 |
+
.b8 116
|
526 |
+
.b8 47
|
527 |
+
.b8 114
|
528 |
+
.b8 110
|
529 |
+
.b8 0
|
530 |
+
.b8 1
|
531 |
+
.b64 $L__func_begin0
|
532 |
+
.b64 $L__func_end0
|
533 |
+
.b8 2
|
534 |
+
.b8 116
|
535 |
+
.b8 114
|
536 |
+
.b8 105
|
537 |
+
.b8 116
|
538 |
+
.b8 111
|
539 |
+
.b8 110
|
540 |
+
.b8 95
|
541 |
+
.b8 95
|
542 |
+
.b8 48
|
543 |
+
.b8 100
|
544 |
+
.b8 49
|
545 |
+
.b8 100
|
546 |
+
.b8 50
|
547 |
+
.b8 100
|
548 |
+
.b8 51
|
549 |
+
.b8 100
|
550 |
+
.b8 52
|
551 |
+
.b8 100
|
552 |
+
.b8 53
|
553 |
+
.b8 100
|
554 |
+
.b8 54
|
555 |
+
.b8 100
|
556 |
+
.b8 101
|
557 |
+
.b8 55
|
558 |
+
.b8 100
|
559 |
+
.b8 101
|
560 |
+
.b8 0
|
561 |
+
.b8 116
|
562 |
+
.b8 114
|
563 |
+
.b8 105
|
564 |
+
.b8 116
|
565 |
+
.b8 111
|
566 |
+
.b8 110
|
567 |
+
.b8 95
|
568 |
+
.b8 95
|
569 |
+
.b8 48
|
570 |
+
.b8 100
|
571 |
+
.b8 49
|
572 |
+
.b8 100
|
573 |
+
.b8 50
|
574 |
+
.b8 100
|
575 |
+
.b8 51
|
576 |
+
.b8 100
|
577 |
+
.b8 52
|
578 |
+
.b8 100
|
579 |
+
.b8 53
|
580 |
+
.b8 100
|
581 |
+
.b8 54
|
582 |
+
.b8 100
|
583 |
+
.b8 101
|
584 |
+
.b8 55
|
585 |
+
.b8 100
|
586 |
+
.b8 101
|
587 |
+
.b8 0
|
588 |
+
.b8 1
|
589 |
+
.b8 18
|
590 |
+
.b8 1
|
591 |
+
.b8 1
|
592 |
+
.b8 3
|
593 |
+
.b64 $L__func_begin0
|
594 |
+
.b64 $L__func_end0
|
595 |
+
.b8 1
|
596 |
+
.b8 156
|
597 |
+
.b32 125
|
598 |
+
.b8 4
|
599 |
+
.b32 125
|
600 |
+
.b64 $L__tmp1
|
601 |
+
.b64 $L__tmp14
|
602 |
+
.b8 2
|
603 |
+
.b8 39
|
604 |
+
.b8 57
|
605 |
+
.b8 5
|
606 |
+
.b32 125
|
607 |
+
.b64 $L__tmp1
|
608 |
+
.b64 $L__tmp14
|
609 |
+
.b8 2
|
610 |
+
.b8 243
|
611 |
+
.b8 36
|
612 |
+
.b8 0
|
613 |
+
.b8 5
|
614 |
+
.b32 125
|
615 |
+
.b64 $L__tmp2
|
616 |
+
.b64 $L__tmp15
|
617 |
+
.b8 2
|
618 |
+
.b8 39
|
619 |
+
.b8 57
|
620 |
+
.b8 5
|
621 |
+
.b32 125
|
622 |
+
.b64 $L__tmp15
|
623 |
+
.b64 $L__tmp16
|
624 |
+
.b8 3
|
625 |
+
.b8 39
|
626 |
+
.b8 44
|
627 |
+
.b8 5
|
628 |
+
.b32 125
|
629 |
+
.b64 $L__tmp17
|
630 |
+
.b64 $L__tmp32
|
631 |
+
.b8 2
|
632 |
+
.b8 43
|
633 |
+
.b8 59
|
634 |
+
.b8 4
|
635 |
+
.b32 125
|
636 |
+
.b64 $L__tmp18
|
637 |
+
.b64 $L__tmp31
|
638 |
+
.b8 2
|
639 |
+
.b8 43
|
640 |
+
.b8 59
|
641 |
+
.b8 5
|
642 |
+
.b32 125
|
643 |
+
.b64 $L__tmp18
|
644 |
+
.b64 $L__tmp31
|
645 |
+
.b8 2
|
646 |
+
.b8 243
|
647 |
+
.b8 36
|
648 |
+
.b8 0
|
649 |
+
.b8 5
|
650 |
+
.b32 125
|
651 |
+
.b64 $L__tmp32
|
652 |
+
.b64 $L__tmp33
|
653 |
+
.b8 3
|
654 |
+
.b8 43
|
655 |
+
.b8 45
|
656 |
+
.b8 0
|
657 |
+
.b8 0
|
658 |
+
}
|
659 |
+
.section .debug_pubnames
|
660 |
+
{
|
661 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
662 |
+
$L__pubNames_start0:
|
663 |
+
.b8 2
|
664 |
+
.b8 0
|
665 |
+
.b32 .debug_info
|
666 |
+
.b32 403
|
667 |
+
.b32 125
|
668 |
+
.b8 116
|
669 |
+
.b8 114
|
670 |
+
.b8 105
|
671 |
+
.b8 116
|
672 |
+
.b8 111
|
673 |
+
.b8 110
|
674 |
+
.b8 95
|
675 |
+
.b8 95
|
676 |
+
.b8 48
|
677 |
+
.b8 100
|
678 |
+
.b8 49
|
679 |
+
.b8 100
|
680 |
+
.b8 50
|
681 |
+
.b8 100
|
682 |
+
.b8 51
|
683 |
+
.b8 100
|
684 |
+
.b8 52
|
685 |
+
.b8 100
|
686 |
+
.b8 53
|
687 |
+
.b8 100
|
688 |
+
.b8 54
|
689 |
+
.b8 100
|
690 |
+
.b8 101
|
691 |
+
.b8 55
|
692 |
+
.b8 100
|
693 |
+
.b8 101
|
694 |
+
.b8 0
|
695 |
+
.b32 0
|
696 |
+
$L__pubNames_end0:
|
697 |
+
}
|
698 |
+
.section .debug_pubtypes
|
699 |
+
{
|
700 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
701 |
+
$L__pubTypes_start0:
|
702 |
+
.b8 2
|
703 |
+
.b8 0
|
704 |
+
.b32 .debug_info
|
705 |
+
.b32 403
|
706 |
+
.b32 0
|
707 |
+
$L__pubTypes_end0:
|
708 |
+
}
|
709 |
+
.section .debug_loc { }
|
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
7 |
+
%c256_i32 = arith.constant 256 : i32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
20 |
+
%9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
21 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
22 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
27 |
+
%16 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
28 |
+
%17 = tt.addptr %16, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
29 |
+
%18 = tt.load %17, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
30 |
+
%19 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
31 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
32 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
33 |
+
%22 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
|
34 |
+
%23 = arith.select %2, %22, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
35 |
+
%24 = "tt.reduce"(%23) <{axis = 0 : i32}> ({
|
36 |
+
^bb0(%arg8: f32, %arg9: f32):
|
37 |
+
%43 = arith.addf %arg8, %arg9 : f32
|
38 |
+
tt.reduce.return %43 : f32
|
39 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
40 |
+
%25 = arith.addf %24, %cst_1 : f32
|
41 |
+
%26 = arith.mulf %22, %15 : tensor<256xf32, #blocked>
|
42 |
+
%27 = arith.select %2, %26, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
43 |
+
%28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
|
44 |
+
^bb0(%arg8: f32, %arg9: f32):
|
45 |
+
%43 = arith.addf %arg8, %arg9 : f32
|
46 |
+
tt.reduce.return %43 : f32
|
47 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
48 |
+
%29 = arith.addf %28, %cst_1 : f32
|
49 |
+
%30 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
|
50 |
+
%31 = arith.mulf %22, %cst_3 : tensor<256xf32, #blocked>
|
51 |
+
%32 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
|
52 |
+
%33 = arith.subf %31, %32 : tensor<256xf32, #blocked>
|
53 |
+
%34 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
|
54 |
+
%35 = arith.mulf %15, %34 : tensor<256xf32, #blocked>
|
55 |
+
%36 = arith.subf %33, %35 : tensor<256xf32, #blocked>
|
56 |
+
%37 = tt.broadcast %30 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
57 |
+
%38 = arith.mulf %37, %36 : tensor<256xf32, #blocked>
|
58 |
+
%39 = arith.addf %18, %38 : tensor<256xf32, #blocked>
|
59 |
+
tt.store %17, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
60 |
+
%40 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
61 |
+
%41 = tt.addptr %40, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
62 |
+
%42 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
63 |
+
tt.store %41, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
64 |
+
tt.return
|
65 |
+
}
|
66 |
+
}
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin
ADDED
Binary file (4.65 kB). View file
|
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
|
4 |
+
%c1024_i32 = arith.constant 1024 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
10 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
11 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
12 |
+
tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
13 |
+
tt.return
|
14 |
+
}
|
15 |
+
}
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin
ADDED
Binary file (28.6 kB). View file
|
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.398942292> : tensor<1024xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
|
8 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
|
9 |
+
%c1024_i32 = arith.constant 1024 : i32
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
12 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
13 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
14 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
15 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
16 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
17 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
18 |
+
%8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
20 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
21 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
22 |
+
%12 = arith.extf %11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
|
23 |
+
%13 = arith.mulf %12, %cst_3 : tensor<1024xf32, #blocked>
|
24 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
|
25 |
+
%15 = arith.addf %14, %cst_2 : tensor<1024xf32, #blocked>
|
26 |
+
%16 = arith.mulf %15, %cst_1 : tensor<1024xf32, #blocked>
|
27 |
+
%17 = arith.mulf %12, %12 : tensor<1024xf32, #blocked>
|
28 |
+
%18 = arith.mulf %17, %cst_0 : tensor<1024xf32, #blocked>
|
29 |
+
%19 = math.exp %18 : tensor<1024xf32, #blocked>
|
30 |
+
%20 = arith.mulf %19, %cst : tensor<1024xf32, #blocked>
|
31 |
+
%21 = arith.mulf %12, %20 : tensor<1024xf32, #blocked>
|
32 |
+
%22 = arith.addf %16, %21 : tensor<1024xf32, #blocked>
|
33 |
+
%23 = arith.mulf %8, %22 : tensor<1024xf32, #blocked>
|
34 |
+
%24 = arith.truncf %23 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
35 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.398942292> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
|
6 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32>
|
7 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32>
|
8 |
+
%c1024_i32 = arith.constant 1024 : i32
|
9 |
+
%0 = tt.get_program_id x : i32
|
10 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
11 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
12 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
13 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
14 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
15 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
16 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
17 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
18 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
19 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
20 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
21 |
+
%12 = arith.extf %11 : tensor<1024xbf16> to tensor<1024xf32>
|
22 |
+
%13 = arith.mulf %12, %cst_3 : tensor<1024xf32>
|
23 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
|
24 |
+
%15 = arith.addf %14, %cst_2 : tensor<1024xf32>
|
25 |
+
%16 = arith.mulf %15, %cst_1 : tensor<1024xf32>
|
26 |
+
%17 = arith.mulf %12, %12 : tensor<1024xf32>
|
27 |
+
%18 = arith.mulf %17, %cst_0 : tensor<1024xf32>
|
28 |
+
%19 = math.exp %18 : tensor<1024xf32>
|
29 |
+
%20 = arith.mulf %19, %cst : tensor<1024xf32>
|
30 |
+
%21 = arith.mulf %12, %20 : tensor<1024xf32>
|
31 |
+
%22 = arith.addf %16, %21 : tensor<1024xf32>
|
32 |
+
%23 = arith.mulf %8, %22 : tensor<1024xf32>
|
33 |
+
%24 = arith.truncf %23 : tensor<1024xf32> to tensor<1024xbf16>
|
34 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
35 |
+
tt.return
|
36 |
+
}
|
37 |
+
}
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<12865792> : tensor<1024xi32>
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst_0 : tensor<1024xi32>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
14 |
+
tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !5 {
|
7 |
+
%13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%14 = and i32 %13, 31, !dbg !8
|
9 |
+
%15 = lshr i32 %13, 5, !dbg !8
|
10 |
+
%16 = shl i32 %13, 2, !dbg !8
|
11 |
+
%17 = and i32 %16, 60, !dbg !8
|
12 |
+
%18 = and i32 %15, 3, !dbg !8
|
13 |
+
%19 = lshr i32 %14, 1, !dbg !8
|
14 |
+
%20 = shl nuw nsw i32 %18, 4, !dbg !8
|
15 |
+
%21 = or i32 %20, %19, !dbg !8
|
16 |
+
%22 = and i32 %16, 4, !dbg !9
|
17 |
+
%23 = lshr i32 %14, 4, !dbg !9
|
18 |
+
%24 = shl nuw nsw i32 %18, 1, !dbg !9
|
19 |
+
%25 = or i32 %24, %23, !dbg !9
|
20 |
+
%26 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
21 |
+
%27 = shl i32 %26, 6, !dbg !11
|
22 |
+
%28 = or i32 %27, %17, !dbg !12
|
23 |
+
%29 = or i32 %27, %21, !dbg !12
|
24 |
+
%.frozen = freeze i32 %28
|
25 |
+
%30 = sdiv i32 %.frozen, 256, !dbg !13
|
26 |
+
%31 = mul i32 %30, 256
|
27 |
+
%.decomposed = sub i32 %.frozen, %31
|
28 |
+
%32 = sdiv i32 %29, 256, !dbg !13
|
29 |
+
%33 = shl i32 %30, 15, !dbg !14
|
30 |
+
%34 = shl nsw i32 %32, 7, !dbg !15
|
31 |
+
%35 = add i32 %33, %.decomposed
|
32 |
+
%36 = mul nuw nsw i32 %17, 12
|
33 |
+
%37 = or i32 %25, %36
|
34 |
+
%38 = zext nneg i32 %37 to i64
|
35 |
+
%39 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %38
|
36 |
+
%40 = or i32 %36, 12
|
37 |
+
%41 = add nuw nsw i32 %40, %25
|
38 |
+
%42 = zext nneg i32 %41 to i64
|
39 |
+
%43 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %42
|
40 |
+
%44 = add nuw nsw i32 %36, 24
|
41 |
+
%45 = or i32 %44, %25
|
42 |
+
%46 = zext nneg i32 %45 to i64
|
43 |
+
%47 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %46
|
44 |
+
%48 = add nuw nsw i32 %36, 36
|
45 |
+
%49 = add nuw nsw i32 %48, %25
|
46 |
+
%50 = zext nneg i32 %49 to i64
|
47 |
+
%51 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %50
|
48 |
+
%52 = mul nuw nsw i32 %21, 12
|
49 |
+
%53 = add nuw nsw i32 %52, %22
|
50 |
+
%54 = zext nneg i32 %53 to i64
|
51 |
+
%55 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %54
|
52 |
+
%56 = getelementptr float, ptr addrspace(3) @global_smem, i64 %38
|
53 |
+
%57 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42
|
54 |
+
%58 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46
|
55 |
+
%59 = getelementptr float, ptr addrspace(3) @global_smem, i64 %50
|
56 |
+
%60 = getelementptr float, ptr addrspace(3) @global_smem, i64 %54
|
57 |
+
%61 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 1
|
58 |
+
%62 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 2
|
59 |
+
%63 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 3
|
60 |
+
br label %64, !dbg !16
|
61 |
+
|
62 |
+
64: ; preds = %12, %64
|
63 |
+
%65 = phi i32 [ 0, %12 ], [ %205, %64 ]
|
64 |
+
%66 = phi <8 x float> [ zeroinitializer, %12 ], [ %204, %64 ]
|
65 |
+
%67 = or i32 %65, %22, !dbg !17
|
66 |
+
%68 = or i32 %65, %25, !dbg !17
|
67 |
+
%69 = shl i32 %68, 8, !dbg !18
|
68 |
+
%70 = add i32 %35, %69, !dbg !19
|
69 |
+
%71 = sext i32 %70 to i64, !dbg !20
|
70 |
+
%72 = getelementptr i16, ptr addrspace(1) %0, i64 %71, !dbg !20
|
71 |
+
%73 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
72 |
+
%74 = extractvalue { i32, i32 } %73, 0, !dbg !21
|
73 |
+
%75 = extractvalue { i32, i32 } %73, 1, !dbg !21
|
74 |
+
%76 = trunc i32 %74 to i16, !dbg !21
|
75 |
+
%extelt.offset = lshr i32 %74, 16, !dbg !21
|
76 |
+
%77 = trunc i32 %extelt.offset to i16, !dbg !21
|
77 |
+
%78 = trunc i32 %75 to i16, !dbg !21
|
78 |
+
%extelt.offset1 = lshr i32 %75, 16, !dbg !21
|
79 |
+
%79 = trunc i32 %extelt.offset1 to i16, !dbg !21
|
80 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
81 |
+
%80 = insertelement <1 x i16> undef, i16 %76, i64 0, !dbg !22
|
82 |
+
store <1 x i16> %80, ptr addrspace(3) %39, align 2, !dbg !22
|
83 |
+
%81 = insertelement <1 x i16> undef, i16 %77, i64 0, !dbg !22
|
84 |
+
store <1 x i16> %81, ptr addrspace(3) %43, align 2, !dbg !22
|
85 |
+
%82 = insertelement <1 x i16> undef, i16 %78, i64 0, !dbg !22
|
86 |
+
store <1 x i16> %82, ptr addrspace(3) %47, align 2, !dbg !22
|
87 |
+
%83 = insertelement <1 x i16> undef, i16 %79, i64 0, !dbg !22
|
88 |
+
store <1 x i16> %83, ptr addrspace(3) %51, align 2, !dbg !22
|
89 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
90 |
+
%84 = load i16, ptr addrspace(3) %55, align 8, !dbg !22
|
91 |
+
%85 = load i16, ptr addrspace(3) %61, align 2, !dbg !22
|
92 |
+
%86 = load i16, ptr addrspace(3) %62, align 4, !dbg !22
|
93 |
+
%87 = load i16, ptr addrspace(3) %63, align 2, !dbg !22
|
94 |
+
%88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #3, !dbg !22
|
95 |
+
%89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #3, !dbg !22
|
96 |
+
%90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #3, !dbg !22
|
97 |
+
%91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %87) #3, !dbg !22
|
98 |
+
%92 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !23
|
99 |
+
%93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %92, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
|
100 |
+
%94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !24
|
101 |
+
%95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !24
|
102 |
+
%96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !24
|
103 |
+
%97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !24
|
104 |
+
%98 = bitcast i32 %94 to float, !dbg !24
|
105 |
+
%99 = bitcast i32 %95 to float, !dbg !24
|
106 |
+
%100 = bitcast i32 %96 to float, !dbg !24
|
107 |
+
%101 = bitcast i32 %97 to float, !dbg !24
|
108 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !24
|
109 |
+
%102 = insertelement <1 x float> undef, float %98, i64 0, !dbg !24
|
110 |
+
store <1 x float> %102, ptr addrspace(3) %56, align 4, !dbg !24
|
111 |
+
%103 = insertelement <1 x float> undef, float %99, i64 0, !dbg !24
|
112 |
+
store <1 x float> %103, ptr addrspace(3) %57, align 4, !dbg !24
|
113 |
+
%104 = insertelement <1 x float> undef, float %100, i64 0, !dbg !24
|
114 |
+
store <1 x float> %104, ptr addrspace(3) %58, align 4, !dbg !24
|
115 |
+
%105 = insertelement <1 x float> undef, float %101, i64 0, !dbg !24
|
116 |
+
store <1 x float> %105, ptr addrspace(3) %59, align 4, !dbg !24
|
117 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !24
|
118 |
+
%106 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !24
|
119 |
+
%107 = getelementptr i16, ptr addrspace(1) %2, i64 %71, !dbg !25
|
120 |
+
%108 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26
|
121 |
+
%109 = extractvalue { i32, i32 } %108, 0, !dbg !26
|
122 |
+
%110 = extractvalue { i32, i32 } %108, 1, !dbg !26
|
123 |
+
%111 = trunc i32 %109 to i16, !dbg !26
|
124 |
+
%extelt.offset2 = lshr i32 %109, 16, !dbg !26
|
125 |
+
%112 = trunc i32 %extelt.offset2 to i16, !dbg !26
|
126 |
+
%113 = trunc i32 %110 to i16, !dbg !26
|
127 |
+
%extelt.offset3 = lshr i32 %110, 16, !dbg !26
|
128 |
+
%114 = trunc i32 %extelt.offset3 to i16, !dbg !26
|
129 |
+
%115 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #3, !dbg !27
|
130 |
+
%116 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #3, !dbg !27
|
131 |
+
%117 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #3, !dbg !27
|
132 |
+
%118 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #3, !dbg !27
|
133 |
+
%119 = add i32 %67, %34, !dbg !28
|
134 |
+
%120 = sext i32 %119 to i64, !dbg !29
|
135 |
+
%121 = getelementptr float, ptr addrspace(1) %3, i64 %120, !dbg !29
|
136 |
+
%122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %121, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !30
|
137 |
+
%123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !30
|
138 |
+
%124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !30
|
139 |
+
%125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !30
|
140 |
+
%126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !30
|
141 |
+
%127 = getelementptr float, ptr addrspace(1) %4, i64 %120, !dbg !31
|
142 |
+
%128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %127, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32
|
143 |
+
%129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !32
|
144 |
+
%130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !32
|
145 |
+
%131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !32
|
146 |
+
%132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !32
|
147 |
+
%133 = getelementptr i16, ptr addrspace(1) %5, i64 %71, !dbg !33
|
148 |
+
%134 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !34
|
149 |
+
%135 = extractvalue { i32, i32 } %134, 0, !dbg !34
|
150 |
+
%136 = extractvalue { i32, i32 } %134, 1, !dbg !34
|
151 |
+
%137 = trunc i32 %135 to i16, !dbg !34
|
152 |
+
%extelt.offset4 = lshr i32 %135, 16, !dbg !34
|
153 |
+
%138 = trunc i32 %extelt.offset4 to i16, !dbg !34
|
154 |
+
%139 = trunc i32 %136 to i16, !dbg !34
|
155 |
+
%extelt.offset5 = lshr i32 %136, 16, !dbg !34
|
156 |
+
%140 = trunc i32 %extelt.offset5 to i16, !dbg !34
|
157 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
158 |
+
%141 = insertelement <1 x i16> undef, i16 %137, i64 0, !dbg !35
|
159 |
+
store <1 x i16> %141, ptr addrspace(3) %39, align 2, !dbg !35
|
160 |
+
%142 = insertelement <1 x i16> undef, i16 %138, i64 0, !dbg !35
|
161 |
+
store <1 x i16> %142, ptr addrspace(3) %43, align 2, !dbg !35
|
162 |
+
%143 = insertelement <1 x i16> undef, i16 %139, i64 0, !dbg !35
|
163 |
+
store <1 x i16> %143, ptr addrspace(3) %47, align 2, !dbg !35
|
164 |
+
%144 = insertelement <1 x i16> undef, i16 %140, i64 0, !dbg !35
|
165 |
+
store <1 x i16> %144, ptr addrspace(3) %51, align 2, !dbg !35
|
166 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
167 |
+
%145 = load i16, ptr addrspace(3) %55, align 8, !dbg !35
|
168 |
+
%146 = load i16, ptr addrspace(3) %61, align 2, !dbg !35
|
169 |
+
%147 = load i16, ptr addrspace(3) %62, align 4, !dbg !35
|
170 |
+
%148 = load i16, ptr addrspace(3) %63, align 2, !dbg !35
|
171 |
+
%149 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %145) #3, !dbg !35
|
172 |
+
%150 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %146) #3, !dbg !35
|
173 |
+
%151 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %147) #3, !dbg !35
|
174 |
+
%152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %148) #3, !dbg !35
|
175 |
+
%153 = getelementptr float, ptr addrspace(1) %6, i64 %120, !dbg !36
|
176 |
+
%154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %153, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !37
|
177 |
+
%155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !37
|
178 |
+
%156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !37
|
179 |
+
%157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !37
|
180 |
+
%158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !37
|
181 |
+
%159 = getelementptr float, ptr addrspace(1) %7, i64 %120, !dbg !38
|
182 |
+
%160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !39
|
183 |
+
%161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !39
|
184 |
+
%162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !39
|
185 |
+
%163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !39
|
186 |
+
%164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !39
|
187 |
+
%165 = fadd float %115, %98, !dbg !40
|
188 |
+
%166 = fadd float %116, %99, !dbg !40
|
189 |
+
%167 = fadd float %117, %100, !dbg !40
|
190 |
+
%168 = fadd float %118, %101, !dbg !40
|
191 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
192 |
+
%169 = insertelement <1 x float> undef, float %165, i64 0, !dbg !40
|
193 |
+
store <1 x float> %169, ptr addrspace(3) %56, align 4, !dbg !40
|
194 |
+
%170 = insertelement <1 x float> undef, float %166, i64 0, !dbg !40
|
195 |
+
store <1 x float> %170, ptr addrspace(3) %57, align 4, !dbg !40
|
196 |
+
%171 = insertelement <1 x float> undef, float %167, i64 0, !dbg !40
|
197 |
+
store <1 x float> %171, ptr addrspace(3) %58, align 4, !dbg !40
|
198 |
+
%172 = insertelement <1 x float> undef, float %168, i64 0, !dbg !40
|
199 |
+
store <1 x float> %172, ptr addrspace(3) %59, align 4, !dbg !40
|
200 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
201 |
+
%173 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !40
|
202 |
+
%174 = insertelement <8 x i32> poison, i32 %155, i64 0, !dbg !37
|
203 |
+
%175 = insertelement <8 x i32> %174, i32 %156, i64 1, !dbg !37
|
204 |
+
%176 = insertelement <8 x i32> %175, i32 %157, i64 2, !dbg !37
|
205 |
+
%177 = insertelement <8 x i32> %176, i32 %158, i64 3, !dbg !37
|
206 |
+
%178 = insertelement <8 x i32> %177, i32 %123, i64 4, !dbg !37
|
207 |
+
%179 = insertelement <8 x i32> %178, i32 %124, i64 5, !dbg !37
|
208 |
+
%180 = insertelement <8 x i32> %179, i32 %125, i64 6, !dbg !37
|
209 |
+
%181 = insertelement <8 x i32> %180, i32 %126, i64 7, !dbg !37
|
210 |
+
%182 = bitcast <8 x i32> %181 to <8 x float>, !dbg !37
|
211 |
+
%183 = insertelement <8 x i32> poison, i32 %161, i64 0, !dbg !39
|
212 |
+
%184 = insertelement <8 x i32> %183, i32 %162, i64 1, !dbg !39
|
213 |
+
%185 = insertelement <8 x i32> %184, i32 %163, i64 2, !dbg !39
|
214 |
+
%186 = insertelement <8 x i32> %185, i32 %164, i64 3, !dbg !39
|
215 |
+
%187 = insertelement <8 x i32> %186, i32 %129, i64 4, !dbg !39
|
216 |
+
%188 = insertelement <8 x i32> %187, i32 %130, i64 5, !dbg !39
|
217 |
+
%189 = insertelement <8 x i32> %188, i32 %131, i64 6, !dbg !39
|
218 |
+
%190 = insertelement <8 x i32> %189, i32 %132, i64 7, !dbg !39
|
219 |
+
%191 = bitcast <8 x i32> %190 to <8 x float>, !dbg !39
|
220 |
+
%192 = shufflevector <4 x float> %106, <4 x float> %173, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg !41
|
221 |
+
%193 = fsub <8 x float> %192, %182, !dbg !41
|
222 |
+
%194 = fmul <8 x float> %193, %191, !dbg !42
|
223 |
+
%195 = insertelement <8 x float> poison, float %149, i64 0, !dbg !43
|
224 |
+
%196 = insertelement <8 x float> %195, float %150, i64 1, !dbg !43
|
225 |
+
%197 = insertelement <8 x float> %196, float %151, i64 2, !dbg !43
|
226 |
+
%198 = insertelement <8 x float> %197, float %152, i64 3, !dbg !43
|
227 |
+
%199 = insertelement <8 x float> %198, float %88, i64 4, !dbg !43
|
228 |
+
%200 = insertelement <8 x float> %199, float %89, i64 5, !dbg !43
|
229 |
+
%201 = insertelement <8 x float> %200, float %90, i64 6, !dbg !43
|
230 |
+
%202 = insertelement <8 x float> %201, float %91, i64 7, !dbg !43
|
231 |
+
%203 = fmul <8 x float> %202, %194, !dbg !43
|
232 |
+
%204 = fadd <8 x float> %66, %203, !dbg !44
|
233 |
+
%205 = add nuw nsw i32 %65, 8, !dbg !16
|
234 |
+
%206 = icmp ult i32 %65, 120, !dbg !16
|
235 |
+
br i1 %206, label %64, label %207, !dbg !16
|
236 |
+
|
237 |
+
207: ; preds = %64
|
238 |
+
%208 = and i32 %13, 63, !dbg !8
|
239 |
+
%209 = or i32 %27, %208, !dbg !12
|
240 |
+
%shift = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>, !dbg !45
|
241 |
+
%210 = fadd <8 x float> %204, %shift, !dbg !45
|
242 |
+
%shift28 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 poison, i32 poison, i32 poison>, !dbg !45
|
243 |
+
%211 = fadd <8 x float> %shift28, %210, !dbg !45
|
244 |
+
%shift29 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison>, !dbg !45
|
245 |
+
%212 = fadd <8 x float> %shift29, %211, !dbg !45
|
246 |
+
%213 = extractelement <8 x float> %212, i64 4, !dbg !45
|
247 |
+
%214 = bitcast float %213 to i32, !dbg !51
|
248 |
+
%215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !51
|
249 |
+
%216 = bitcast i32 %215 to float, !dbg !51
|
250 |
+
%217 = fadd float %213, %216, !dbg !45
|
251 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !53
|
252 |
+
%218 = zext nneg i32 %21 to i64, !dbg !53
|
253 |
+
%219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !53
|
254 |
+
%220 = insertelement <1 x float> undef, float %217, i64 0, !dbg !53
|
255 |
+
store <1 x float> %220, ptr addrspace(3) %219, align 4, !dbg !53
|
256 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !53
|
257 |
+
%221 = zext nneg i32 %208 to i64, !dbg !53
|
258 |
+
%222 = getelementptr float, ptr addrspace(3) @global_smem, i64 %221, !dbg !53
|
259 |
+
%223 = load i32, ptr addrspace(3) %222, align 4, !dbg !53
|
260 |
+
%224 = sext i32 %209 to i64, !dbg !54
|
261 |
+
%225 = getelementptr float, ptr addrspace(1) %8, i64 %224, !dbg !54
|
262 |
+
%226 = and i32 %13, 64, !dbg !55
|
263 |
+
%227 = icmp eq i32 %226, 0, !dbg !55
|
264 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %223, ptr addrspace(1) %225, i1 %227) #3, !dbg !55
|
265 |
+
%shift30 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
|
266 |
+
%228 = fadd <8 x float> %204, %shift30, !dbg !56
|
267 |
+
%shift31 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
|
268 |
+
%229 = fadd <8 x float> %shift31, %228, !dbg !56
|
269 |
+
%shift32 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
|
270 |
+
%230 = fadd <8 x float> %shift32, %229, !dbg !56
|
271 |
+
%231 = extractelement <8 x float> %230, i64 0, !dbg !56
|
272 |
+
%232 = bitcast float %231 to i32, !dbg !59
|
273 |
+
%233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !59
|
274 |
+
%234 = bitcast i32 %233 to float, !dbg !59
|
275 |
+
%235 = fadd float %231, %234, !dbg !56
|
276 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !61
|
277 |
+
%236 = insertelement <1 x float> undef, float %235, i64 0, !dbg !61
|
278 |
+
store <1 x float> %236, ptr addrspace(3) %219, align 4, !dbg !61
|
279 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !61
|
280 |
+
%237 = load i32, ptr addrspace(3) %222, align 4, !dbg !61
|
281 |
+
%238 = getelementptr float, ptr addrspace(1) %9, i64 %224, !dbg !62
|
282 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %237, ptr addrspace(1) %238, i1 %227) #3, !dbg !63
|
283 |
+
ret void, !dbg !64
|
284 |
+
}
|
285 |
+
|
286 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
287 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
288 |
+
|
289 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
290 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
291 |
+
|
292 |
+
; Function Attrs: convergent nocallback nounwind
|
293 |
+
declare void @llvm.nvvm.barrier0() #2
|
294 |
+
|
295 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
296 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
297 |
+
attributes #2 = { convergent nocallback nounwind }
|
298 |
+
attributes #3 = { nounwind }
|
299 |
+
|
300 |
+
!llvm.module.flags = !{!0}
|
301 |
+
!llvm.dbg.cu = !{!1}
|
302 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
303 |
+
|
304 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
305 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
306 |
+
!2 = !DIFile(filename: "c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py", directory: "/tmp/torchinductor_root/3x")
|
307 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1}
|
308 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 128}
|
309 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
310 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
311 |
+
!7 = !{}
|
312 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
313 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
314 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
315 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
316 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
317 |
+
!13 = !DILocation(line: 26, column: 20, scope: !5)
|
318 |
+
!14 = !DILocation(line: 34, column: 57, scope: !5)
|
319 |
+
!15 = !DILocation(line: 37, column: 44, scope: !5)
|
320 |
+
!16 = !DILocation(line: 30, column: 36, scope: !5)
|
321 |
+
!17 = !DILocation(line: 31, column: 27, scope: !5)
|
322 |
+
!18 = !DILocation(line: 34, column: 44, scope: !5)
|
323 |
+
!19 = !DILocation(line: 34, column: 51, scope: !5)
|
324 |
+
!20 = !DILocation(line: 34, column: 34, scope: !5)
|
325 |
+
!21 = !DILocation(line: 34, column: 63, scope: !5)
|
326 |
+
!22 = !DILocation(line: 34, column: 115, scope: !5)
|
327 |
+
!23 = !DILocation(line: 35, column: 34, scope: !5)
|
328 |
+
!24 = !DILocation(line: 35, column: 63, scope: !5)
|
329 |
+
!25 = !DILocation(line: 36, column: 34, scope: !5)
|
330 |
+
!26 = !DILocation(line: 36, column: 63, scope: !5)
|
331 |
+
!27 = !DILocation(line: 36, column: 115, scope: !5)
|
332 |
+
!28 = !DILocation(line: 37, column: 40, scope: !5)
|
333 |
+
!29 = !DILocation(line: 37, column: 34, scope: !5)
|
334 |
+
!30 = !DILocation(line: 37, column: 50, scope: !5)
|
335 |
+
!31 = !DILocation(line: 38, column: 34, scope: !5)
|
336 |
+
!32 = !DILocation(line: 38, column: 50, scope: !5)
|
337 |
+
!33 = !DILocation(line: 39, column: 35, scope: !5)
|
338 |
+
!34 = !DILocation(line: 39, column: 64, scope: !5)
|
339 |
+
!35 = !DILocation(line: 39, column: 116, scope: !5)
|
340 |
+
!36 = !DILocation(line: 40, column: 35, scope: !5)
|
341 |
+
!37 = !DILocation(line: 40, column: 51, scope: !5)
|
342 |
+
!38 = !DILocation(line: 41, column: 35, scope: !5)
|
343 |
+
!39 = !DILocation(line: 41, column: 51, scope: !5)
|
344 |
+
!40 = !DILocation(line: 44, column: 22, scope: !5)
|
345 |
+
!41 = !DILocation(line: 52, column: 23, scope: !5)
|
346 |
+
!42 = !DILocation(line: 53, column: 24, scope: !5)
|
347 |
+
!43 = !DILocation(line: 54, column: 24, scope: !5)
|
348 |
+
!44 = !DILocation(line: 57, column: 40, scope: !5)
|
349 |
+
!45 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !49)
|
350 |
+
!46 = distinct !DILexicalBlockFile(scope: !48, file: !47, discriminator: 0)
|
351 |
+
!47 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
352 |
+
!48 = distinct !DILexicalBlockFile(scope: !5, file: !47, discriminator: 0)
|
353 |
+
!49 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !50)
|
354 |
+
!50 = !DILocation(line: 58, column: 27, scope: !46)
|
355 |
+
!51 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !52)
|
356 |
+
!52 = !DILocation(line: 58, column: 27, scope: !48)
|
357 |
+
!53 = !DILocation(line: 58, column: 30, scope: !5)
|
358 |
+
!54 = !DILocation(line: 59, column: 25, scope: !5)
|
359 |
+
!55 = !DILocation(line: 59, column: 37, scope: !5)
|
360 |
+
!56 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !57)
|
361 |
+
!57 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !58)
|
362 |
+
!58 = !DILocation(line: 60, column: 27, scope: !46)
|
363 |
+
!59 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !60)
|
364 |
+
!60 = !DILocation(line: 60, column: 27, scope: !48)
|
365 |
+
!61 = !DILocation(line: 60, column: 30, scope: !5)
|
366 |
+
!62 = !DILocation(line: 61, column: 25, scope: !5)
|
367 |
+
!63 = !DILocation(line: 61, column: 37, scope: !5)
|
368 |
+
!64 = !DILocation(line: 61, column: 4, scope: !5)
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked1>
|
8 |
+
%cst_1 = arith.constant dense<128> : tensor<64x1xi32, #blocked1>
|
9 |
+
%cst_2 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1>
|
12 |
+
%cst_5 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
|
13 |
+
%c0_i32 = arith.constant 0 : i32
|
14 |
+
%c128_i32 = arith.constant 128 : i32
|
15 |
+
%c8_i32 = arith.constant 8 : i32
|
16 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1>
|
17 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
18 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
|
19 |
+
%c64_i32 = arith.constant 64 : i32
|
20 |
+
%0 = tt.get_program_id x : i32
|
21 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
22 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
23 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
24 |
+
%4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
25 |
+
%5 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
26 |
+
%6 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
27 |
+
%7 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2>
|
28 |
+
%8 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
29 |
+
%9 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
30 |
+
%10 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2>
|
31 |
+
%11 = arith.addi %8, %5 : tensor<64x1xi32, #blocked>
|
32 |
+
%12 = arith.addi %9, %6 : tensor<64x1xi32, #blocked1>
|
33 |
+
%13 = arith.addi %10, %7 : tensor<64x1xi32, #blocked2>
|
34 |
+
%14 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
35 |
+
%15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
36 |
+
%16 = tt.expand_dims %14 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1>
|
37 |
+
%17 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
38 |
+
%18 = arith.remsi %11, %cst : tensor<64x1xi32, #blocked>
|
39 |
+
%19 = arith.divsi %11, %cst : tensor<64x1xi32, #blocked>
|
40 |
+
%20 = arith.divsi %12, %cst_0 : tensor<64x1xi32, #blocked1>
|
41 |
+
%21 = tt.broadcast %18 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
42 |
+
%22 = arith.muli %19, %cst_2 : tensor<64x1xi32, #blocked>
|
43 |
+
%23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
44 |
+
%24 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
45 |
+
%25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
46 |
+
%26 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
47 |
+
%27 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked1>
|
48 |
+
%28 = tt.broadcast %27 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
|
49 |
+
%29 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
50 |
+
%30 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
51 |
+
%31 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
52 |
+
%32 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
53 |
+
%33 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
54 |
+
%34:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_6, %arg14 = %cst_6) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 {
|
55 |
+
%45 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked1>
|
56 |
+
%46 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked>
|
57 |
+
%47 = arith.addi %45, %16 : tensor<1x8xi32, #blocked1>
|
58 |
+
%48 = arith.addi %46, %17 : tensor<1x8xi32, #blocked>
|
59 |
+
%49 = arith.cmpi slt, %47, %cst_4 : tensor<1x8xi32, #blocked1>
|
60 |
+
%50 = arith.cmpi slt, %48, %cst_5 : tensor<1x8xi32, #blocked>
|
61 |
+
%51 = arith.muli %48, %cst_3 : tensor<1x8xi32, #blocked>
|
62 |
+
%52 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
63 |
+
%53 = arith.addi %21, %52 : tensor<64x8xi32, #blocked>
|
64 |
+
%54 = arith.addi %53, %23 : tensor<64x8xi32, #blocked>
|
65 |
+
%55 = tt.addptr %24, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
66 |
+
%56 = tt.broadcast %49 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1>
|
67 |
+
%57 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
68 |
+
%58 = tt.load %55, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
69 |
+
%59 = triton_gpu.convert_layout %58 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
|
70 |
+
%60 = arith.extf %59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
|
71 |
+
%61 = tt.addptr %25, %54 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
72 |
+
%62 = tt.load %61, %57, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
73 |
+
%63 = triton_gpu.convert_layout %62 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
|
74 |
+
%64 = tt.addptr %26, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
75 |
+
%65 = tt.load %64, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
76 |
+
%66 = arith.extf %65 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
|
77 |
+
%67 = tt.broadcast %47 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
|
78 |
+
%68 = arith.addi %67, %28 : tensor<64x8xi32, #blocked1>
|
79 |
+
%69 = tt.addptr %29, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
80 |
+
%70 = tt.load %69, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
81 |
+
%71 = tt.addptr %30, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
82 |
+
%72 = tt.load %71, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
83 |
+
%73 = tt.addptr %31, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
84 |
+
%74 = tt.load %73, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
85 |
+
%75 = triton_gpu.convert_layout %74 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
|
86 |
+
%76 = arith.extf %75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
|
87 |
+
%77 = tt.addptr %32, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
88 |
+
%78 = tt.load %77, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
89 |
+
%79 = tt.addptr %33, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
90 |
+
%80 = tt.load %79, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
91 |
+
%81 = arith.addf %62, %66 : tensor<64x8xf32, #blocked>
|
92 |
+
%82 = triton_gpu.convert_layout %81 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
|
93 |
+
%83 = arith.subf %82, %70 : tensor<64x8xf32, #blocked1>
|
94 |
+
%84 = arith.mulf %83, %72 : tensor<64x8xf32, #blocked1>
|
95 |
+
%85 = arith.mulf %60, %84 : tensor<64x8xf32, #blocked1>
|
96 |
+
%86 = arith.addf %arg13, %85 : tensor<64x8xf32, #blocked1>
|
97 |
+
%87 = arith.select %56, %86, %arg13 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
|
98 |
+
%88 = arith.subf %63, %78 : tensor<64x8xf32, #blocked1>
|
99 |
+
%89 = arith.mulf %88, %80 : tensor<64x8xf32, #blocked1>
|
100 |
+
%90 = arith.mulf %76, %89 : tensor<64x8xf32, #blocked1>
|
101 |
+
%91 = arith.addf %arg14, %90 : tensor<64x8xf32, #blocked1>
|
102 |
+
%92 = arith.select %56, %91, %arg14 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
|
103 |
+
scf.yield %87, %92 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>
|
104 |
+
}
|
105 |
+
%35 = "tt.reduce"(%34#0) <{axis = 1 : i32}> ({
|
106 |
+
^bb0(%arg12: f32, %arg13: f32):
|
107 |
+
%45 = arith.addf %arg12, %arg13 : f32
|
108 |
+
tt.reduce.return %45 : f32
|
109 |
+
}) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
110 |
+
%36 = triton_gpu.convert_layout %35 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
111 |
+
%37 = tt.expand_dims %36 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
|
112 |
+
%38 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
|
113 |
+
%39 = tt.addptr %38, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
|
114 |
+
tt.store %39, %37 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
|
115 |
+
%40 = "tt.reduce"(%34#1) <{axis = 1 : i32}> ({
|
116 |
+
^bb0(%arg12: f32, %arg13: f32):
|
117 |
+
%45 = arith.addf %arg12, %arg13 : f32
|
118 |
+
tt.reduce.return %45 : f32
|
119 |
+
}) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
120 |
+
%41 = triton_gpu.convert_layout %40 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
121 |
+
%42 = tt.expand_dims %41 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
|
122 |
+
%43 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
|
123 |
+
%44 = tt.addptr %43, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
|
124 |
+
tt.store %44, %42 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
|
125 |
+
tt.return
|
126 |
+
}
|
127 |
+
}
|
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir
ADDED
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
8 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%10 = and i32 %9, 31, !dbg !10
|
10 |
+
%11 = lshr i32 %9, 5, !dbg !10
|
11 |
+
%12 = and i32 %11, 1, !dbg !10
|
12 |
+
%urem = shl i32 %9, 2, !dbg !10
|
13 |
+
%13 = and i32 %urem, 252, !dbg !10
|
14 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%15 = shl i32 %14, 8, !dbg !12
|
16 |
+
%16 = or i32 %15, %13, !dbg !13
|
17 |
+
%17 = sext i32 %16 to i64, !dbg !14
|
18 |
+
%18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !14
|
19 |
+
%19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
|
21 |
+
%21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
|
22 |
+
%22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
|
23 |
+
%23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
|
24 |
+
%24 = bitcast i32 %22 to float, !dbg !15
|
25 |
+
%25 = bitcast i32 %23 to float, !dbg !15
|
26 |
+
%26 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !16
|
27 |
+
%27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
28 |
+
%28 = extractvalue { i32, i32 } %27, 0, !dbg !17
|
29 |
+
%29 = extractvalue { i32, i32 } %27, 1, !dbg !17
|
30 |
+
%30 = trunc i32 %28 to i16, !dbg !17
|
31 |
+
%extelt.offset = lshr i32 %28, 16, !dbg !17
|
32 |
+
%31 = trunc i32 %extelt.offset to i16, !dbg !17
|
33 |
+
%32 = trunc i32 %29 to i16, !dbg !17
|
34 |
+
%extelt.offset1 = lshr i32 %29, 16, !dbg !17
|
35 |
+
%33 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
36 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
|
37 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
|
38 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
|
39 |
+
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
|
40 |
+
%38 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !19
|
41 |
+
%39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %38, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
42 |
+
%40 = extractvalue { i32, i32 } %39, 0, !dbg !20
|
43 |
+
%41 = extractvalue { i32, i32 } %39, 1, !dbg !20
|
44 |
+
%42 = trunc i32 %40 to i16, !dbg !20
|
45 |
+
%extelt.offset2 = lshr i32 %40, 16, !dbg !20
|
46 |
+
%43 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
47 |
+
%44 = trunc i32 %41 to i16, !dbg !20
|
48 |
+
%extelt.offset3 = lshr i32 %41, 16, !dbg !20
|
49 |
+
%45 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
50 |
+
%46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
|
51 |
+
%47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
|
52 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
|
53 |
+
%49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
|
54 |
+
%50 = getelementptr i16, ptr addrspace(1) %3, i64 %17, !dbg !22
|
55 |
+
%51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
56 |
+
%52 = extractvalue { i32, i32 } %51, 0, !dbg !23
|
57 |
+
%53 = extractvalue { i32, i32 } %51, 1, !dbg !23
|
58 |
+
%54 = trunc i32 %52 to i16, !dbg !23
|
59 |
+
%extelt.offset4 = lshr i32 %52, 16, !dbg !23
|
60 |
+
%55 = trunc i32 %extelt.offset4 to i16, !dbg !23
|
61 |
+
%56 = trunc i32 %53 to i16, !dbg !23
|
62 |
+
%extelt.offset5 = lshr i32 %53, 16, !dbg !23
|
63 |
+
%57 = trunc i32 %extelt.offset5 to i16, !dbg !23
|
64 |
+
%58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !24
|
65 |
+
%59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !24
|
66 |
+
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24
|
67 |
+
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24
|
68 |
+
%62 = zext nneg i32 %13 to i64, !dbg !25
|
69 |
+
%63 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !25
|
70 |
+
%64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
|
71 |
+
%65 = fadd float %36, %24, !dbg !27
|
72 |
+
%66 = fadd float %37, %25, !dbg !27
|
73 |
+
%67 = fadd float %65, %48, !dbg !28
|
74 |
+
%68 = fadd float %66, %49, !dbg !28
|
75 |
+
%69 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !15
|
76 |
+
%70 = insertelement <2 x i32> %69, i32 %21, i64 1, !dbg !15
|
77 |
+
%71 = bitcast <2 x i32> %70 to <2 x float>, !dbg !15
|
78 |
+
%72 = insertelement <2 x float> poison, float %34, i64 0, !dbg !27
|
79 |
+
%73 = insertelement <2 x float> %72, float %35, i64 1, !dbg !27
|
80 |
+
%74 = fadd <2 x float> %73, %71, !dbg !27
|
81 |
+
%75 = insertelement <2 x float> poison, float %46, i64 0, !dbg !28
|
82 |
+
%76 = insertelement <2 x float> %75, float %47, i64 1, !dbg !28
|
83 |
+
%77 = fadd <2 x float> %74, %76, !dbg !28
|
84 |
+
%78 = insertelement <2 x float> poison, float %58, i64 0, !dbg !29
|
85 |
+
%79 = insertelement <2 x float> %78, float %59, i64 1, !dbg !29
|
86 |
+
%80 = fadd <2 x float> %77, %79, !dbg !29
|
87 |
+
%81 = fadd float %67, %60, !dbg !29
|
88 |
+
%82 = fadd float %68, %61, !dbg !29
|
89 |
+
%83 = extractelement <2 x float> %80, i64 0, !dbg !30
|
90 |
+
%84 = extractelement <2 x float> %80, i64 1, !dbg !30
|
91 |
+
%85 = fadd float %83, %84, !dbg !30
|
92 |
+
%86 = fadd float %85, %81, !dbg !30
|
93 |
+
%87 = fadd float %86, %82, !dbg !30
|
94 |
+
%88 = bitcast float %87 to i32, !dbg !36
|
95 |
+
%89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !36
|
96 |
+
%90 = bitcast i32 %89 to float, !dbg !36
|
97 |
+
%91 = fadd float %87, %90, !dbg !30
|
98 |
+
%92 = bitcast float %91 to i32, !dbg !36
|
99 |
+
%93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 8, i32 31), !dbg !36
|
100 |
+
%94 = bitcast i32 %93 to float, !dbg !36
|
101 |
+
%95 = fadd float %91, %94, !dbg !30
|
102 |
+
%96 = bitcast float %95 to i32, !dbg !36
|
103 |
+
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 4, i32 31), !dbg !36
|
104 |
+
%98 = bitcast i32 %97 to float, !dbg !36
|
105 |
+
%99 = fadd float %95, %98, !dbg !30
|
106 |
+
%100 = bitcast float %99 to i32, !dbg !36
|
107 |
+
%101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 2, i32 31), !dbg !36
|
108 |
+
%102 = bitcast i32 %101 to float, !dbg !36
|
109 |
+
%103 = fadd float %99, %102, !dbg !30
|
110 |
+
%104 = bitcast float %103 to i32, !dbg !36
|
111 |
+
%105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !36
|
112 |
+
%106 = bitcast i32 %105 to float, !dbg !36
|
113 |
+
%107 = fadd float %103, %106, !dbg !30
|
114 |
+
%108 = icmp eq i32 %10, 0, !dbg !36
|
115 |
+
%109 = zext nneg i32 %12 to i64, !dbg !36
|
116 |
+
%110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !36
|
117 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %107, i1 %108) #6, !dbg !36
|
118 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !36
|
119 |
+
%111 = icmp slt i32 %9, 2, !dbg !36
|
120 |
+
%112 = sext i32 %9 to i64, !dbg !36
|
121 |
+
%113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !36
|
122 |
+
%114 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !36
|
123 |
+
%115 = bitcast float %114 to i32, !dbg !36
|
124 |
+
%116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !36
|
125 |
+
%117 = bitcast i32 %116 to float, !dbg !36
|
126 |
+
%118 = fadd float %114, %117, !dbg !30
|
127 |
+
%119 = and i32 %9, 1, !dbg !36
|
128 |
+
%120 = icmp eq i32 %119, 0, !dbg !36
|
129 |
+
%121 = and i1 %111, %120, !dbg !36
|
130 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %118, i1 %121) #6, !dbg !36
|
131 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !36
|
132 |
+
%122 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36
|
133 |
+
%123 = fadd float %122, 0.000000e+00, !dbg !38
|
134 |
+
%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #6, !dbg !42
|
135 |
+
%125 = fsub float %83, %124, !dbg !43
|
136 |
+
%126 = fsub float %84, %124, !dbg !43
|
137 |
+
%127 = fsub float %81, %124, !dbg !43
|
138 |
+
%128 = fsub float %82, %124, !dbg !43
|
139 |
+
%129 = fmul float %125, %125, !dbg !44
|
140 |
+
%130 = fmul float %126, %126, !dbg !44
|
141 |
+
%131 = fmul float %127, %127, !dbg !44
|
142 |
+
%132 = fmul float %128, %128, !dbg !44
|
143 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
144 |
+
%133 = fadd float %129, %130, !dbg !47
|
145 |
+
%134 = fadd float %131, %133, !dbg !47
|
146 |
+
%135 = fadd float %132, %134, !dbg !47
|
147 |
+
%136 = bitcast float %135 to i32, !dbg !45
|
148 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !45
|
149 |
+
%138 = bitcast i32 %137 to float, !dbg !45
|
150 |
+
%139 = fadd float %135, %138, !dbg !47
|
151 |
+
%140 = bitcast float %139 to i32, !dbg !45
|
152 |
+
%141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !45
|
153 |
+
%142 = bitcast i32 %141 to float, !dbg !45
|
154 |
+
%143 = fadd float %139, %142, !dbg !47
|
155 |
+
%144 = bitcast float %143 to i32, !dbg !45
|
156 |
+
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !45
|
157 |
+
%146 = bitcast i32 %145 to float, !dbg !45
|
158 |
+
%147 = fadd float %143, %146, !dbg !47
|
159 |
+
%148 = bitcast float %147 to i32, !dbg !45
|
160 |
+
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !45
|
161 |
+
%150 = bitcast i32 %149 to float, !dbg !45
|
162 |
+
%151 = fadd float %147, %150, !dbg !47
|
163 |
+
%152 = bitcast float %151 to i32, !dbg !45
|
164 |
+
%153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !45
|
165 |
+
%154 = bitcast i32 %153 to float, !dbg !45
|
166 |
+
%155 = fadd float %151, %154, !dbg !47
|
167 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %155, i1 %108) #6, !dbg !45
|
168 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
169 |
+
%156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !45
|
170 |
+
%157 = bitcast float %156 to i32, !dbg !45
|
171 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !45
|
172 |
+
%159 = bitcast i32 %158 to float, !dbg !45
|
173 |
+
%160 = fadd float %156, %159, !dbg !47
|
174 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %160, i1 %121) #6, !dbg !45
|
175 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
176 |
+
%161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
|
177 |
+
%162 = fadd float %161, 0.000000e+00, !dbg !50
|
178 |
+
%163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float 2.560000e+02) #6, !dbg !52
|
179 |
+
%164 = fadd float %163, 0x3EE4F8B580000000, !dbg !53
|
180 |
+
%165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54
|
181 |
+
%.not.i = icmp eq i32 %165, 0, !dbg !54
|
182 |
+
br i1 %.not.i, label %168, label %166, !dbg !54
|
183 |
+
|
184 |
+
166: ; preds = %8
|
185 |
+
%167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !54
|
186 |
+
br label %__nv_rsqrtf.exit, !dbg !54
|
187 |
+
|
188 |
+
168: ; preds = %8
|
189 |
+
%169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !54
|
190 |
+
br label %__nv_rsqrtf.exit, !dbg !54
|
191 |
+
|
192 |
+
__nv_rsqrtf.exit: ; preds = %166, %168
|
193 |
+
%.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !54
|
194 |
+
%170 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !26
|
195 |
+
%171 = bitcast i32 %170 to float, !dbg !26
|
196 |
+
%172 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !26
|
197 |
+
%173 = bitcast i32 %172 to float, !dbg !26
|
198 |
+
%174 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !26
|
199 |
+
%175 = bitcast i32 %174 to float, !dbg !26
|
200 |
+
%176 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !26
|
201 |
+
%177 = bitcast i32 %176 to float, !dbg !26
|
202 |
+
%178 = fmul float %125, %.0.i, !dbg !55
|
203 |
+
%179 = fmul float %126, %.0.i, !dbg !55
|
204 |
+
%180 = fmul float %127, %.0.i, !dbg !55
|
205 |
+
%181 = fmul float %128, %.0.i, !dbg !55
|
206 |
+
%182 = fmul float %178, %177, !dbg !56
|
207 |
+
%183 = fmul float %179, %175, !dbg !56
|
208 |
+
%184 = fmul float %180, %173, !dbg !56
|
209 |
+
%185 = fmul float %181, %171, !dbg !56
|
210 |
+
%186 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !57
|
211 |
+
%187 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %182) #6, !dbg !58
|
212 |
+
%188 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %183) #6, !dbg !58
|
213 |
+
%189 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !58
|
214 |
+
%190 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !58
|
215 |
+
%191 = insertelement <2 x i16> undef, i16 %187, i64 0, !dbg !58
|
216 |
+
%192 = insertelement <2 x i16> %191, i16 %188, i64 1, !dbg !58
|
217 |
+
%193 = bitcast <2 x i16> %192 to i32, !dbg !58
|
218 |
+
%194 = insertelement <2 x i16> undef, i16 %189, i64 0, !dbg !58
|
219 |
+
%195 = insertelement <2 x i16> %194, i16 %190, i64 1, !dbg !58
|
220 |
+
%196 = bitcast <2 x i16> %195 to i32, !dbg !58
|
221 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %193, i32 %196, ptr addrspace(1) %186, i1 true) #6, !dbg !58
|
222 |
+
ret void, !dbg !59
|
223 |
+
}
|
224 |
+
|
225 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
226 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
227 |
+
|
228 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
229 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
230 |
+
|
231 |
+
; Function Attrs: convergent nocallback nounwind
|
232 |
+
declare void @llvm.nvvm.barrier0() #2
|
233 |
+
|
234 |
+
; Function Attrs: alwaysinline nounwind
|
235 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
236 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
237 |
+
%.not = icmp eq i32 %1, 0
|
238 |
+
br i1 %.not, label %4, label %2
|
239 |
+
|
240 |
+
2: ; preds = %0
|
241 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
242 |
+
br label %6
|
243 |
+
|
244 |
+
4: ; preds = %0
|
245 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
246 |
+
br label %6
|
247 |
+
|
248 |
+
6: ; preds = %4, %2
|
249 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
250 |
+
ret float %.0
|
251 |
+
}
|
252 |
+
|
253 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
254 |
+
|
255 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
256 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
257 |
+
|
258 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
259 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
260 |
+
|
261 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
262 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
263 |
+
attributes #2 = { convergent nocallback nounwind }
|
264 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
265 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
266 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
267 |
+
attributes #6 = { nounwind }
|
268 |
+
|
269 |
+
!llvm.module.flags = !{!0, !1}
|
270 |
+
!llvm.dbg.cu = !{!2}
|
271 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
272 |
+
!llvm.ident = !{!6}
|
273 |
+
|
274 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
275 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
276 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
277 |
+
!3 = !DIFile(filename: "c4qmi2qsgi5mnuig7w3wx5jmjnmvktjlgcv4c6q7w2vaw3bk6qzb.py", directory: "/tmp/torchinductor_root/4q")
|
278 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
279 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
|
280 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
281 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
282 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
283 |
+
!9 = !{}
|
284 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
285 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
286 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
287 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
288 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
289 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
290 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
291 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
292 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
293 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
294 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
295 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
296 |
+
!22 = !DILocation(line: 33, column: 30, scope: !7)
|
297 |
+
!23 = !DILocation(line: 33, column: 46, scope: !7)
|
298 |
+
!24 = !DILocation(line: 33, column: 67, scope: !7)
|
299 |
+
!25 = !DILocation(line: 34, column: 31, scope: !7)
|
300 |
+
!26 = !DILocation(line: 34, column: 36, scope: !7)
|
301 |
+
!27 = !DILocation(line: 36, column: 18, scope: !7)
|
302 |
+
!28 = !DILocation(line: 38, column: 18, scope: !7)
|
303 |
+
!29 = !DILocation(line: 40, column: 18, scope: !7)
|
304 |
+
!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34)
|
305 |
+
!31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0)
|
306 |
+
!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
307 |
+
!33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
|
308 |
+
!34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
|
309 |
+
!35 = !DILocation(line: 45, column: 59, scope: !31)
|
310 |
+
!36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37)
|
311 |
+
!37 = !DILocation(line: 45, column: 59, scope: !33)
|
312 |
+
!38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41)
|
313 |
+
!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
|
314 |
+
!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
315 |
+
!41 = !DILocation(line: 45, column: 45, scope: !39)
|
316 |
+
!42 = !DILocation(line: 48, column: 20, scope: !7)
|
317 |
+
!43 = !DILocation(line: 49, column: 20, scope: !7)
|
318 |
+
!44 = !DILocation(line: 50, column: 20, scope: !7)
|
319 |
+
!45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46)
|
320 |
+
!46 = !DILocation(line: 53, column: 59, scope: !33)
|
321 |
+
!47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48)
|
322 |
+
!48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49)
|
323 |
+
!49 = !DILocation(line: 53, column: 59, scope: !31)
|
324 |
+
!50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51)
|
325 |
+
!51 = !DILocation(line: 53, column: 45, scope: !39)
|
326 |
+
!52 = !DILocation(line: 56, column: 20, scope: !7)
|
327 |
+
!53 = !DILocation(line: 58, column: 20, scope: !7)
|
328 |
+
!54 = !DILocation(line: 59, column: 26, scope: !7)
|
329 |
+
!55 = !DILocation(line: 60, column: 20, scope: !7)
|
330 |
+
!56 = !DILocation(line: 61, column: 20, scope: !7)
|
331 |
+
!57 = !DILocation(line: 63, column: 25, scope: !7)
|
332 |
+
!58 = !DILocation(line: 63, column: 48, scope: !7)
|
333 |
+
!59 = !DILocation(line: 63, column: 4, scope: !7)
|
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin
ADDED
Binary file (13.1 kB). View file
|
|
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = shl i32 %6, 2, !dbg !8
|
11 |
+
%10 = and i32 %9, 60, !dbg !8
|
12 |
+
%11 = and i32 %8, 3, !dbg !9
|
13 |
+
%12 = lshr i32 %7, 4, !dbg !9
|
14 |
+
%13 = shl nuw nsw i32 %11, 1, !dbg !9
|
15 |
+
%14 = or i32 %13, %12, !dbg !9
|
16 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
17 |
+
%16 = shl i32 %15, 6, !dbg !11
|
18 |
+
%17 = or i32 %16, %10, !dbg !12
|
19 |
+
br label %18, !dbg !13
|
20 |
+
|
21 |
+
18: ; preds = %5, %18
|
22 |
+
%19 = phi i32 [ 0, %5 ], [ %37, %18 ]
|
23 |
+
%20 = phi <4 x float> [ zeroinitializer, %5 ], [ %36, %18 ]
|
24 |
+
%21 = or i32 %19, %14, !dbg !14
|
25 |
+
%22 = shl i32 %21, 17, !dbg !15
|
26 |
+
%23 = add i32 %17, %22, !dbg !16
|
27 |
+
%24 = sext i32 %23 to i64, !dbg !17
|
28 |
+
%25 = getelementptr float, ptr addrspace(1) %0, i64 %24, !dbg !17
|
29 |
+
%26 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
|
30 |
+
%27 = extractvalue { i32, i32, i32, i32 } %26, 0, !dbg !18
|
31 |
+
%28 = extractvalue { i32, i32, i32, i32 } %26, 1, !dbg !18
|
32 |
+
%29 = extractvalue { i32, i32, i32, i32 } %26, 2, !dbg !18
|
33 |
+
%30 = extractvalue { i32, i32, i32, i32 } %26, 3, !dbg !18
|
34 |
+
%31 = insertelement <4 x i32> poison, i32 %27, i64 0, !dbg !18
|
35 |
+
%32 = insertelement <4 x i32> %31, i32 %28, i64 1, !dbg !18
|
36 |
+
%33 = insertelement <4 x i32> %32, i32 %29, i64 2, !dbg !18
|
37 |
+
%34 = insertelement <4 x i32> %33, i32 %30, i64 3, !dbg !18
|
38 |
+
%35 = bitcast <4 x i32> %34 to <4 x float>, !dbg !18
|
39 |
+
%36 = fadd <4 x float> %20, %35, !dbg !19
|
40 |
+
%37 = add nuw nsw i32 %19, 8, !dbg !13
|
41 |
+
%38 = icmp ult i32 %19, 112, !dbg !13
|
42 |
+
br i1 %38, label %18, label %39, !dbg !13
|
43 |
+
|
44 |
+
39: ; preds = %18
|
45 |
+
%40 = and i32 %6, 63, !dbg !8
|
46 |
+
%41 = or i32 %16, %40, !dbg !12
|
47 |
+
%42 = or i32 %10, 3, !dbg !20
|
48 |
+
%43 = or i32 %10, 2, !dbg !20
|
49 |
+
%44 = or i32 %10, 1, !dbg !20
|
50 |
+
%45 = extractelement <4 x float> %36, i64 0, !dbg !20
|
51 |
+
%46 = bitcast float %45 to i32, !dbg !20
|
52 |
+
%47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !20
|
53 |
+
%48 = bitcast i32 %47 to float, !dbg !20
|
54 |
+
%49 = fadd float %45, %48, !dbg !24
|
55 |
+
%50 = extractelement <4 x float> %36, i64 1, !dbg !20
|
56 |
+
%51 = bitcast float %50 to i32, !dbg !20
|
57 |
+
%52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 16, i32 31), !dbg !20
|
58 |
+
%53 = bitcast i32 %52 to float, !dbg !20
|
59 |
+
%54 = fadd float %50, %53, !dbg !24
|
60 |
+
%55 = extractelement <4 x float> %36, i64 2, !dbg !20
|
61 |
+
%56 = bitcast float %55 to i32, !dbg !20
|
62 |
+
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20
|
63 |
+
%58 = bitcast i32 %57 to float, !dbg !20
|
64 |
+
%59 = fadd float %55, %58, !dbg !24
|
65 |
+
%60 = extractelement <4 x float> %36, i64 3, !dbg !20
|
66 |
+
%61 = bitcast float %60 to i32, !dbg !20
|
67 |
+
%62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 16, i32 31), !dbg !20
|
68 |
+
%63 = bitcast i32 %62 to float, !dbg !20
|
69 |
+
%64 = fadd float %60, %63, !dbg !24
|
70 |
+
%65 = icmp ult i32 %7, 16, !dbg !20
|
71 |
+
%66 = shl nuw nsw i32 %10, 2, !dbg !20
|
72 |
+
%67 = or i32 %66, %11, !dbg !20
|
73 |
+
%68 = zext nneg i32 %67 to i64, !dbg !20
|
74 |
+
%69 = getelementptr float, ptr addrspace(3) @global_smem, i64 %68, !dbg !20
|
75 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, float %49, i1 %65) #3, !dbg !20
|
76 |
+
%70 = shl nuw nsw i32 %44, 2, !dbg !20
|
77 |
+
%71 = or i32 %70, %11, !dbg !20
|
78 |
+
%72 = zext nneg i32 %71 to i64, !dbg !20
|
79 |
+
%73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !20
|
80 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %54, i1 %65) #3, !dbg !20
|
81 |
+
%74 = shl nuw nsw i32 %43, 2, !dbg !20
|
82 |
+
%75 = or i32 %74, %11, !dbg !20
|
83 |
+
%76 = zext nneg i32 %75 to i64, !dbg !20
|
84 |
+
%77 = getelementptr float, ptr addrspace(3) @global_smem, i64 %76, !dbg !20
|
85 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, float %59, i1 %65) #3, !dbg !20
|
86 |
+
%78 = shl nuw nsw i32 %42, 2, !dbg !20
|
87 |
+
%79 = or i32 %78, %11, !dbg !20
|
88 |
+
%80 = zext nneg i32 %79 to i64, !dbg !20
|
89 |
+
%81 = getelementptr float, ptr addrspace(3) @global_smem, i64 %80, !dbg !20
|
90 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %81, float %64, i1 %65) #3, !dbg !20
|
91 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
92 |
+
%82 = icmp slt i32 %6, 256, !dbg !20
|
93 |
+
%83 = sext i32 %6 to i64, !dbg !20
|
94 |
+
%84 = getelementptr float, ptr addrspace(3) @global_smem, i64 %83, !dbg !20
|
95 |
+
%85 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %84, i1 %82) #3, !dbg !20
|
96 |
+
%86 = bitcast float %85 to i32, !dbg !20
|
97 |
+
%87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 2, i32 31), !dbg !20
|
98 |
+
%88 = bitcast i32 %87 to float, !dbg !20
|
99 |
+
%89 = fadd float %85, %88, !dbg !24
|
100 |
+
%90 = bitcast float %89 to i32, !dbg !20
|
101 |
+
%91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 1, i32 31), !dbg !20
|
102 |
+
%92 = bitcast i32 %91 to float, !dbg !20
|
103 |
+
%93 = fadd float %89, %92, !dbg !24
|
104 |
+
%94 = and i32 %6, 3, !dbg !20
|
105 |
+
%95 = icmp eq i32 %94, 0, !dbg !20
|
106 |
+
%96 = and i1 %82, %95, !dbg !20
|
107 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, float %93, i1 %96) #3, !dbg !20
|
108 |
+
%97 = add i32 %6, 128, !dbg !20
|
109 |
+
%98 = sext i32 %97 to i64, !dbg !20
|
110 |
+
%99 = getelementptr float, ptr addrspace(3) @global_smem, i64 %98, !dbg !20
|
111 |
+
%100 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %99, i1 %82) #3, !dbg !20
|
112 |
+
%101 = bitcast float %100 to i32, !dbg !20
|
113 |
+
%102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 2, i32 31), !dbg !20
|
114 |
+
%103 = bitcast i32 %102 to float, !dbg !20
|
115 |
+
%104 = fadd float %100, %103, !dbg !24
|
116 |
+
%105 = bitcast float %104 to i32, !dbg !20
|
117 |
+
%106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !20
|
118 |
+
%107 = bitcast i32 %106 to float, !dbg !20
|
119 |
+
%108 = fadd float %104, %107, !dbg !24
|
120 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %99, float %108, i1 %96) #3, !dbg !20
|
121 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
122 |
+
%109 = zext nneg i32 %66 to i64, !dbg !20
|
123 |
+
%110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !20
|
124 |
+
%111 = load float, ptr addrspace(3) %110, align 4, !dbg !20
|
125 |
+
%112 = zext nneg i32 %70 to i64, !dbg !20
|
126 |
+
%113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !20
|
127 |
+
%114 = load float, ptr addrspace(3) %113, align 4, !dbg !20
|
128 |
+
%115 = zext nneg i32 %74 to i64, !dbg !20
|
129 |
+
%116 = getelementptr float, ptr addrspace(3) @global_smem, i64 %115, !dbg !20
|
130 |
+
%117 = load float, ptr addrspace(3) %116, align 4, !dbg !20
|
131 |
+
%118 = zext nneg i32 %78 to i64, !dbg !20
|
132 |
+
%119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !20
|
133 |
+
%120 = load float, ptr addrspace(3) %119, align 4, !dbg !20
|
134 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
135 |
+
%121 = zext nneg i32 %10 to i64, !dbg !28
|
136 |
+
%122 = getelementptr float, ptr addrspace(3) @global_smem, i64 %121, !dbg !28
|
137 |
+
%123 = insertelement <1 x float> undef, float %111, i64 0, !dbg !28
|
138 |
+
store <1 x float> %123, ptr addrspace(3) %122, align 4, !dbg !28
|
139 |
+
%124 = zext nneg i32 %44 to i64, !dbg !28
|
140 |
+
%125 = getelementptr float, ptr addrspace(3) @global_smem, i64 %124, !dbg !28
|
141 |
+
%126 = insertelement <1 x float> undef, float %114, i64 0, !dbg !28
|
142 |
+
store <1 x float> %126, ptr addrspace(3) %125, align 4, !dbg !28
|
143 |
+
%127 = zext nneg i32 %43 to i64, !dbg !28
|
144 |
+
%128 = getelementptr float, ptr addrspace(3) @global_smem, i64 %127, !dbg !28
|
145 |
+
%129 = insertelement <1 x float> undef, float %117, i64 0, !dbg !28
|
146 |
+
store <1 x float> %129, ptr addrspace(3) %128, align 4, !dbg !28
|
147 |
+
%130 = zext nneg i32 %42 to i64, !dbg !28
|
148 |
+
%131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !28
|
149 |
+
%132 = insertelement <1 x float> undef, float %120, i64 0, !dbg !28
|
150 |
+
store <1 x float> %132, ptr addrspace(3) %131, align 4, !dbg !28
|
151 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
152 |
+
%133 = zext nneg i32 %40 to i64, !dbg !28
|
153 |
+
%134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !28
|
154 |
+
%135 = load <1 x float>, ptr addrspace(3) %134, align 4, !dbg !28
|
155 |
+
%.frozen = freeze i32 %41
|
156 |
+
%136 = sdiv i32 %.frozen, 256, !dbg !29
|
157 |
+
%137 = mul i32 %136, 256
|
158 |
+
%.decomposed = sub i32 %.frozen, %137
|
159 |
+
%138 = sext i32 %136 to i64, !dbg !30
|
160 |
+
%139 = getelementptr i64, ptr addrspace(1) %1, i64 %138, !dbg !30
|
161 |
+
%140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %139, i1 true) #3, !dbg !31
|
162 |
+
%141 = lshr i64 %140, 54, !dbg !32
|
163 |
+
%142 = and i64 %141, 512, !dbg !32
|
164 |
+
%143 = add i64 %142, %140, !dbg !32
|
165 |
+
%144 = shl i64 %143, 8, !dbg !33
|
166 |
+
%145 = sext i32 %.decomposed to i64, !dbg !34
|
167 |
+
%146 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !35
|
168 |
+
%147 = getelementptr float, ptr addrspace(1) %146, i64 %145, !dbg !35
|
169 |
+
%148 = and i32 %6, 64, !dbg !36
|
170 |
+
%149 = icmp eq i32 %148, 0, !dbg !36
|
171 |
+
%150 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %147, <1 x float> %135, i1 %149) #3, !dbg !36
|
172 |
+
ret void, !dbg !37
|
173 |
+
}
|
174 |
+
|
175 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
176 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
177 |
+
|
178 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
179 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
180 |
+
|
181 |
+
; Function Attrs: convergent nocallback nounwind
|
182 |
+
declare void @llvm.nvvm.barrier0() #2
|
183 |
+
|
184 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
185 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
186 |
+
attributes #2 = { convergent nocallback nounwind }
|
187 |
+
attributes #3 = { nounwind }
|
188 |
+
|
189 |
+
!llvm.module.flags = !{!0}
|
190 |
+
!llvm.dbg.cu = !{!1}
|
191 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
192 |
+
|
193 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
194 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
195 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
196 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
197 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128}
|
198 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
199 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
200 |
+
!7 = !{}
|
201 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
202 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
203 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
204 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
205 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
206 |
+
!13 = !DILocation(line: 27, column: 36, scope: !5)
|
207 |
+
!14 = !DILocation(line: 28, column: 27, scope: !5)
|
208 |
+
!15 = !DILocation(line: 31, column: 47, scope: !5)
|
209 |
+
!16 = !DILocation(line: 31, column: 40, scope: !5)
|
210 |
+
!17 = !DILocation(line: 31, column: 34, scope: !5)
|
211 |
+
!18 = !DILocation(line: 31, column: 53, scope: !5)
|
212 |
+
!19 = !DILocation(line: 34, column: 38, scope: !5)
|
213 |
+
!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
|
214 |
+
!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
|
215 |
+
!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
216 |
+
!23 = !DILocation(line: 35, column: 25, scope: !21)
|
217 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
218 |
+
!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
|
219 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
220 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
221 |
+
!28 = !DILocation(line: 35, column: 28, scope: !5)
|
222 |
+
!29 = !DILocation(line: 36, column: 20, scope: !5)
|
223 |
+
!30 = !DILocation(line: 38, column: 30, scope: !5)
|
224 |
+
!31 = !DILocation(line: 38, column: 35, scope: !5)
|
225 |
+
!32 = !DILocation(line: 41, column: 32, scope: !5)
|
226 |
+
!33 = !DILocation(line: 45, column: 40, scope: !5)
|
227 |
+
!34 = !DILocation(line: 45, column: 36, scope: !5)
|
228 |
+
!35 = !DILocation(line: 45, column: 30, scope: !5)
|
229 |
+
!36 = !DILocation(line: 45, column: 55, scope: !5)
|
230 |
+
!37 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 128, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<3>;
|
19 |
+
.reg .b16 %rs<3>;
|
20 |
+
.reg .b32 %r<12>;
|
21 |
+
.reg .b64 %rd<7>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd3, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r7, %tid.x;
|
31 |
+
shl.b32 %r8, %r7, 1;
|
32 |
+
and.b32 %r9, %r8, 254;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r10, %r1, 8;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r11, %r10, %r9;
|
39 |
+
.loc 1 24 30
|
40 |
+
mul.wide.s32 %rd5, %r11, 2;
|
41 |
+
add.s64 %rd1, %rd3, %rd5;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 35
|
44 |
+
mov.u32 %r2, 0x0;
|
45 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
|
46 |
+
cvt.u16.u32 %rs1, %r2;
|
47 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
48 |
+
.loc 1 24 44
|
49 |
+
cvt.f32.bf16 %r5, %rs1;
|
50 |
+
cvt.f32.bf16 %r6, %rs2;
|
51 |
+
.loc 1 26 25
|
52 |
+
mul.wide.s32 %rd6, %r11, 4;
|
53 |
+
add.s64 %rd2, %rd4, %rd6;
|
54 |
+
.loc 1 26 36
|
55 |
+
@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 };
|
56 |
+
.loc 1 26 4
|
57 |
+
ret;
|
58 |
+
$L__tmp1:
|
59 |
+
$L__func_end0:
|
60 |
+
|
61 |
+
}
|
62 |
+
.file 1 "/tmp/torchinductor_root/zl/czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py"
|
63 |
+
.section .debug_abbrev
|
64 |
+
{
|
65 |
+
.b8 1
|
66 |
+
.b8 17
|
67 |
+
.b8 1
|
68 |
+
.b8 37
|
69 |
+
.b8 8
|
70 |
+
.b8 19
|
71 |
+
.b8 5
|
72 |
+
.b8 3
|
73 |
+
.b8 8
|
74 |
+
.b8 16
|
75 |
+
.b8 6
|
76 |
+
.b8 27
|
77 |
+
.b8 8
|
78 |
+
.b8 180
|
79 |
+
.b8 66
|
80 |
+
.b8 12
|
81 |
+
.b8 17
|
82 |
+
.b8 1
|
83 |
+
.b8 18
|
84 |
+
.b8 1
|
85 |
+
.b8 0
|
86 |
+
.b8 0
|
87 |
+
.b8 2
|
88 |
+
.b8 46
|
89 |
+
.b8 0
|
90 |
+
.b8 17
|
91 |
+
.b8 1
|
92 |
+
.b8 18
|
93 |
+
.b8 1
|
94 |
+
.b8 64
|
95 |
+
.b8 10
|
96 |
+
.b8 135
|
97 |
+
.b8 64
|
98 |
+
.b8 8
|
99 |
+
.b8 3
|
100 |
+
.b8 8
|
101 |
+
.b8 58
|
102 |
+
.b8 11
|
103 |
+
.b8 59
|
104 |
+
.b8 11
|
105 |
+
.b8 63
|
106 |
+
.b8 12
|
107 |
+
.b8 0
|
108 |
+
.b8 0
|
109 |
+
.b8 0
|
110 |
+
}
|
111 |
+
.section .debug_info
|
112 |
+
{
|
113 |
+
.b32 176
|
114 |
+
.b8 2
|
115 |
+
.b8 0
|
116 |
+
.b32 .debug_abbrev
|
117 |
+
.b8 8
|
118 |
+
.b8 1
|
119 |
+
.b8 116
|
120 |
+
.b8 114
|
121 |
+
.b8 105
|
122 |
+
.b8 116
|
123 |
+
.b8 111
|
124 |
+
.b8 110
|
125 |
+
.b8 0
|
126 |
+
.b8 2
|
127 |
+
.b8 0
|
128 |
+
.b8 99
|
129 |
+
.b8 122
|
130 |
+
.b8 108
|
131 |
+
.b8 54
|
132 |
+
.b8 110
|
133 |
+
.b8 109
|
134 |
+
.b8 119
|
135 |
+
.b8 97
|
136 |
+
.b8 115
|
137 |
+
.b8 108
|
138 |
+
.b8 55
|
139 |
+
.b8 107
|
140 |
+
.b8 52
|
141 |
+
.b8 105
|
142 |
+
.b8 99
|
143 |
+
.b8 53
|
144 |
+
.b8 53
|
145 |
+
.b8 120
|
146 |
+
.b8 111
|
147 |
+
.b8 119
|
148 |
+
.b8 105
|
149 |
+
.b8 104
|
150 |
+
.b8 99
|
151 |
+
.b8 122
|
152 |
+
.b8 99
|
153 |
+
.b8 111
|
154 |
+
.b8 111
|
155 |
+
.b8 104
|
156 |
+
.b8 51
|
157 |
+
.b8 109
|
158 |
+
.b8 104
|
159 |
+
.b8 117
|
160 |
+
.b8 53
|
161 |
+
.b8 118
|
162 |
+
.b8 54
|
163 |
+
.b8 108
|
164 |
+
.b8 115
|
165 |
+
.b8 54
|
166 |
+
.b8 119
|
167 |
+
.b8 50
|
168 |
+
.b8 120
|
169 |
+
.b8 122
|
170 |
+
.b8 113
|
171 |
+
.b8 113
|
172 |
+
.b8 111
|
173 |
+
.b8 99
|
174 |
+
.b8 100
|
175 |
+
.b8 99
|
176 |
+
.b8 50
|
177 |
+
.b8 100
|
178 |
+
.b8 97
|
179 |
+
.b8 55
|
180 |
+
.b8 46
|
181 |
+
.b8 112
|
182 |
+
.b8 121
|
183 |
+
.b8 0
|
184 |
+
.b32 .debug_line
|
185 |
+
.b8 47
|
186 |
+
.b8 116
|
187 |
+
.b8 109
|
188 |
+
.b8 112
|
189 |
+
.b8 47
|
190 |
+
.b8 116
|
191 |
+
.b8 111
|
192 |
+
.b8 114
|
193 |
+
.b8 99
|
194 |
+
.b8 104
|
195 |
+
.b8 105
|
196 |
+
.b8 110
|
197 |
+
.b8 100
|
198 |
+
.b8 117
|
199 |
+
.b8 99
|
200 |
+
.b8 116
|
201 |
+
.b8 111
|
202 |
+
.b8 114
|
203 |
+
.b8 95
|
204 |
+
.b8 114
|
205 |
+
.b8 111
|
206 |
+
.b8 111
|
207 |
+
.b8 116
|
208 |
+
.b8 47
|
209 |
+
.b8 122
|
210 |
+
.b8 108
|
211 |
+
.b8 0
|
212 |
+
.b8 1
|
213 |
+
.b64 $L__func_begin0
|
214 |
+
.b64 $L__func_end0
|
215 |
+
.b8 2
|
216 |
+
.b64 $L__func_begin0
|
217 |
+
.b64 $L__func_end0
|
218 |
+
.b8 1
|
219 |
+
.b8 156
|
220 |
+
.b8 116
|
221 |
+
.b8 114
|
222 |
+
.b8 105
|
223 |
+
.b8 116
|
224 |
+
.b8 111
|
225 |
+
.b8 110
|
226 |
+
.b8 95
|
227 |
+
.b8 95
|
228 |
+
.b8 48
|
229 |
+
.b8 100
|
230 |
+
.b8 49
|
231 |
+
.b8 100
|
232 |
+
.b8 50
|
233 |
+
.b8 100
|
234 |
+
.b8 101
|
235 |
+
.b8 0
|
236 |
+
.b8 116
|
237 |
+
.b8 114
|
238 |
+
.b8 105
|
239 |
+
.b8 116
|
240 |
+
.b8 111
|
241 |
+
.b8 110
|
242 |
+
.b8 95
|
243 |
+
.b8 95
|
244 |
+
.b8 48
|
245 |
+
.b8 100
|
246 |
+
.b8 49
|
247 |
+
.b8 100
|
248 |
+
.b8 50
|
249 |
+
.b8 100
|
250 |
+
.b8 101
|
251 |
+
.b8 0
|
252 |
+
.b8 1
|
253 |
+
.b8 18
|
254 |
+
.b8 1
|
255 |
+
.b8 0
|
256 |
+
}
|
257 |
+
.section .debug_pubnames
|
258 |
+
{
|
259 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
260 |
+
$L__pubNames_start0:
|
261 |
+
.b8 2
|
262 |
+
.b8 0
|
263 |
+
.b32 .debug_info
|
264 |
+
.b32 180
|
265 |
+
.b32 125
|
266 |
+
.b8 116
|
267 |
+
.b8 114
|
268 |
+
.b8 105
|
269 |
+
.b8 116
|
270 |
+
.b8 111
|
271 |
+
.b8 110
|
272 |
+
.b8 95
|
273 |
+
.b8 95
|
274 |
+
.b8 48
|
275 |
+
.b8 100
|
276 |
+
.b8 49
|
277 |
+
.b8 100
|
278 |
+
.b8 50
|
279 |
+
.b8 100
|
280 |
+
.b8 101
|
281 |
+
.b8 0
|
282 |
+
.b32 0
|
283 |
+
$L__pubNames_end0:
|
284 |
+
}
|
285 |
+
.section .debug_pubtypes
|
286 |
+
{
|
287 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
288 |
+
$L__pubTypes_start0:
|
289 |
+
.b8 2
|
290 |
+
.b8 0
|
291 |
+
.b32 .debug_info
|
292 |
+
.b32 180
|
293 |
+
.b32 0
|
294 |
+
$L__pubTypes_end0:
|
295 |
+
}
|
296 |
+
.section .debug_loc { }
|
.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx
ADDED
@@ -0,0 +1,777 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8de9de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
|
21 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
|
22 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
|
23 |
+
)
|
24 |
+
.maxntid 64, 1, 1
|
25 |
+
{
|
26 |
+
.reg .pred %p<40>;
|
27 |
+
.reg .b16 %rs<13>;
|
28 |
+
.reg .b32 %r<118>;
|
29 |
+
.reg .f32 %f<94>;
|
30 |
+
.reg .b64 %rd<28>;
|
31 |
+
.loc 1 18 0
|
32 |
+
$L__func_begin0:
|
33 |
+
.loc 1 18 0
|
34 |
+
|
35 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
|
36 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
|
37 |
+
$L__tmp0:
|
38 |
+
.loc 1 26 26
|
39 |
+
mov.u32 %r84, %tid.x;
|
40 |
+
and.b32 %r85, %r84, 31;
|
41 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
|
42 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
|
43 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
|
44 |
+
shl.b32 %r86, %r84, 2;
|
45 |
+
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
|
46 |
+
and.b32 %r87, %r86, 252;
|
47 |
+
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
|
48 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
|
49 |
+
.loc 1 23 28
|
50 |
+
mov.u32 %r1, %ctaid.x;
|
51 |
+
.loc 1 30 40
|
52 |
+
shl.b32 %r88, %r1, 8;
|
53 |
+
.loc 1 30 36
|
54 |
+
or.b32 %r89, %r88, %r87;
|
55 |
+
.loc 1 30 30
|
56 |
+
mul.wide.s32 %rd24, %r89, 2;
|
57 |
+
add.s64 %rd1, %rd17, %rd24;
|
58 |
+
mov.b32 %r4, 0;
|
59 |
+
mov.pred %p1, -1;
|
60 |
+
.loc 1 30 46
|
61 |
+
mov.u32 %r2, 0x0;
|
62 |
+
mov.u32 %r3, 0x0;
|
63 |
+
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
|
64 |
+
@!%p1 mov.u32 %r2, %r4;
|
65 |
+
@!%p1 mov.u32 %r3, %r4;
|
66 |
+
cvt.u16.u32 %rs1, %r2;
|
67 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
68 |
+
cvt.u16.u32 %rs3, %r3;
|
69 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
70 |
+
.loc 1 30 67
|
71 |
+
cvt.f32.bf16 %r6, %rs1;
|
72 |
+
mov.b32 %f1, %r6;
|
73 |
+
cvt.f32.bf16 %r7, %rs2;
|
74 |
+
mov.b32 %f2, %r7;
|
75 |
+
cvt.f32.bf16 %r8, %rs3;
|
76 |
+
mov.b32 %f3, %r8;
|
77 |
+
cvt.f32.bf16 %r9, %rs4;
|
78 |
+
mov.b32 %f4, %r9;
|
79 |
+
.loc 1 31 30
|
80 |
+
mul.wide.u32 %rd25, %r87, 4;
|
81 |
+
add.s64 %rd2, %rd18, %rd25;
|
82 |
+
.loc 1 31 35
|
83 |
+
mov.u32 %r10, 0x0;
|
84 |
+
mov.u32 %r11, 0x0;
|
85 |
+
mov.u32 %r12, 0x0;
|
86 |
+
mov.u32 %r13, 0x0;
|
87 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
88 |
+
@!%p1 mov.u32 %r10, %r4;
|
89 |
+
@!%p1 mov.u32 %r11, %r4;
|
90 |
+
@!%p1 mov.u32 %r12, %r4;
|
91 |
+
@!%p1 mov.u32 %r13, %r4;
|
92 |
+
mov.b32 %f5, %r10;
|
93 |
+
mov.b32 %f6, %r11;
|
94 |
+
mov.b32 %f7, %r12;
|
95 |
+
mov.b32 %f8, %r13;
|
96 |
+
.loc 1 32 30
|
97 |
+
mul.wide.s32 %rd26, %r89, 4;
|
98 |
+
add.s64 %rd3, %rd19, %rd26;
|
99 |
+
.loc 1 32 46
|
100 |
+
mov.u32 %r18, 0x0;
|
101 |
+
mov.u32 %r19, 0x0;
|
102 |
+
mov.u32 %r20, 0x0;
|
103 |
+
mov.u32 %r21, 0x0;
|
104 |
+
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
105 |
+
@!%p1 mov.u32 %r18, %r4;
|
106 |
+
@!%p1 mov.u32 %r19, %r4;
|
107 |
+
@!%p1 mov.u32 %r20, %r4;
|
108 |
+
@!%p1 mov.u32 %r21, %r4;
|
109 |
+
mov.b32 %f9, %r18;
|
110 |
+
mov.b32 %f10, %r19;
|
111 |
+
mov.b32 %f11, %r20;
|
112 |
+
mov.b32 %f12, %r21;
|
113 |
+
.loc 1 33 30
|
114 |
+
add.s64 %rd4, %rd20, %rd24;
|
115 |
+
.loc 1 33 46
|
116 |
+
mov.u32 %r26, 0x0;
|
117 |
+
mov.u32 %r27, 0x0;
|
118 |
+
@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
|
119 |
+
@!%p1 mov.u32 %r26, %r4;
|
120 |
+
@!%p1 mov.u32 %r27, %r4;
|
121 |
+
cvt.u16.u32 %rs5, %r26;
|
122 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r26; }
|
123 |
+
cvt.u16.u32 %rs7, %r27;
|
124 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
|
125 |
+
.loc 1 33 67
|
126 |
+
cvt.f32.bf16 %r30, %rs5;
|
127 |
+
mov.b32 %f13, %r30;
|
128 |
+
cvt.f32.bf16 %r31, %rs6;
|
129 |
+
mov.b32 %f14, %r31;
|
130 |
+
cvt.f32.bf16 %r32, %rs7;
|
131 |
+
mov.b32 %f15, %r32;
|
132 |
+
cvt.f32.bf16 %r33, %rs8;
|
133 |
+
mov.b32 %f16, %r33;
|
134 |
+
.loc 1 34 31
|
135 |
+
mul.wide.s32 %rd27, %r1, 4;
|
136 |
+
add.s64 %rd5, %rd21, %rd27;
|
137 |
+
.loc 1 34 36
|
138 |
+
mov.u32 %r34, 0x0;
|
139 |
+
@%p1 ld.global.L1::evict_last.b32 { %r34 }, [ %rd5 + 0 ];
|
140 |
+
mov.b32 %f17, %r34;
|
141 |
+
mov.u32 %r35, 0x0;
|
142 |
+
@%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ];
|
143 |
+
mov.u32 %r36, 0x0;
|
144 |
+
@%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ];
|
145 |
+
mov.u32 %r37, 0x0;
|
146 |
+
@%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ];
|
147 |
+
.loc 1 35 31
|
148 |
+
add.s64 %rd9, %rd22, %rd27;
|
149 |
+
.loc 1 35 36
|
150 |
+
mov.u32 %r63, 0x0;
|
151 |
+
@%p1 ld.global.L1::evict_last.b32 { %r63 }, [ %rd9 + 0 ];
|
152 |
+
mov.b32 %f18, %r63;
|
153 |
+
mov.u32 %r39, 0x0;
|
154 |
+
@%p1 ld.global.L1::evict_last.b32 { %r39 }, [ %rd9 + 0 ];
|
155 |
+
mov.u32 %r40, 0x0;
|
156 |
+
@%p1 ld.global.L1::evict_last.b32 { %r40 }, [ %rd9 + 0 ];
|
157 |
+
mov.u32 %r41, 0x0;
|
158 |
+
@%p1 ld.global.L1::evict_last.b32 { %r41 }, [ %rd9 + 0 ];
|
159 |
+
.loc 1 36 35
|
160 |
+
add.s64 %rd13, %rd16, %rd26;
|
161 |
+
.loc 1 36 51
|
162 |
+
mov.u32 %r42, 0x0;
|
163 |
+
mov.u32 %r43, 0x0;
|
164 |
+
mov.u32 %r44, 0x0;
|
165 |
+
mov.u32 %r45, 0x0;
|
166 |
+
@%p1 ld.global.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd13 + 0 ];
|
167 |
+
@!%p1 mov.u32 %r42, %r4;
|
168 |
+
@!%p1 mov.u32 %r43, %r4;
|
169 |
+
@!%p1 mov.u32 %r44, %r4;
|
170 |
+
@!%p1 mov.u32 %r45, %r4;
|
171 |
+
mov.b32 %f19, %r42;
|
172 |
+
mov.b32 %f20, %r43;
|
173 |
+
mov.b32 %f21, %r44;
|
174 |
+
mov.b32 %f22, %r45;
|
175 |
+
.loc 1 38 18
|
176 |
+
mul.f32 %f23, %f1, %f5;
|
177 |
+
mul.f32 %f24, %f2, %f6;
|
178 |
+
mul.f32 %f25, %f3, %f7;
|
179 |
+
mul.f32 %f26, %f4, %f8;
|
180 |
+
$L__tmp1:
|
181 |
+
.loc 2 233 15
|
182 |
+
fma.rn.f32 %f27, %f1, %f5, %f24;
|
183 |
+
fma.rn.f32 %f28, %f3, %f7, %f27;
|
184 |
+
fma.rn.f32 %f29, %f4, %f8, %f28;
|
185 |
+
$L__tmp2:
|
186 |
+
.loc 2 243 36
|
187 |
+
mov.b32 %r90, %f29;
|
188 |
+
shfl.sync.bfly.b32 %r91, %r90, 16, 31, -1;
|
189 |
+
mov.b32 %f30, %r91;
|
190 |
+
$L__tmp3:
|
191 |
+
.loc 2 233 15
|
192 |
+
add.f32 %f31, %f29, %f30;
|
193 |
+
$L__tmp4:
|
194 |
+
.loc 2 243 36
|
195 |
+
mov.b32 %r92, %f31;
|
196 |
+
shfl.sync.bfly.b32 %r93, %r92, 8, 31, -1;
|
197 |
+
mov.b32 %f32, %r93;
|
198 |
+
$L__tmp5:
|
199 |
+
.loc 2 233 15
|
200 |
+
add.f32 %f33, %f31, %f32;
|
201 |
+
$L__tmp6:
|
202 |
+
.loc 2 243 36
|
203 |
+
mov.b32 %r94, %f33;
|
204 |
+
shfl.sync.bfly.b32 %r95, %r94, 4, 31, -1;
|
205 |
+
mov.b32 %f34, %r95;
|
206 |
+
$L__tmp7:
|
207 |
+
.loc 2 233 15
|
208 |
+
add.f32 %f35, %f33, %f34;
|
209 |
+
$L__tmp8:
|
210 |
+
.loc 2 243 36
|
211 |
+
mov.b32 %r96, %f35;
|
212 |
+
shfl.sync.bfly.b32 %r97, %r96, 2, 31, -1;
|
213 |
+
mov.b32 %f36, %r97;
|
214 |
+
$L__tmp9:
|
215 |
+
.loc 2 233 15
|
216 |
+
add.f32 %f37, %f35, %f36;
|
217 |
+
$L__tmp10:
|
218 |
+
.loc 2 243 36
|
219 |
+
mov.b32 %r98, %f37;
|
220 |
+
shfl.sync.bfly.b32 %r99, %r98, 1, 31, -1;
|
221 |
+
mov.b32 %f38, %r99;
|
222 |
+
$L__tmp11:
|
223 |
+
.loc 2 233 15
|
224 |
+
add.f32 %f39, %f37, %f38;
|
225 |
+
$L__tmp12:
|
226 |
+
.loc 2 243 36
|
227 |
+
setp.eq.s32 %p30, %r85, 0;
|
228 |
+
shr.u32 %r100, %r84, 3;
|
229 |
+
and.b32 %r101, %r100, 4;
|
230 |
+
mov.u32 %r102, global_smem;
|
231 |
+
add.s32 %r50, %r102, %r101;
|
232 |
+
mov.b32 %r51, %f39;
|
233 |
+
@%p30 st.shared.b32 [ %r50 + 0 ], %r51;
|
234 |
+
bar.sync 0;
|
235 |
+
setp.lt.s32 %p31, %r84, 2;
|
236 |
+
add.s32 %r53, %r102, %r86;
|
237 |
+
@%p31 ld.shared.b32 %r52, [ %r53 + 0 ];
|
238 |
+
mov.b32 %f40, %r52;
|
239 |
+
shfl.sync.bfly.b32 %r103, %r52, 1, 31, -1;
|
240 |
+
mov.b32 %f41, %r103;
|
241 |
+
$L__tmp13:
|
242 |
+
.loc 2 233 15
|
243 |
+
add.f32 %f42, %f40, %f41;
|
244 |
+
$L__tmp14:
|
245 |
+
.loc 2 243 36
|
246 |
+
and.b32 %r104, %r84, 1;
|
247 |
+
setp.eq.b32 %p38, %r104, 1;
|
248 |
+
not.pred %p39, %p38;
|
249 |
+
and.pred %p32, %p31, %p39;
|
250 |
+
mov.b32 %r55, %f42;
|
251 |
+
@%p32 st.shared.b32 [ %r53 + 0 ], %r55;
|
252 |
+
bar.sync 0;
|
253 |
+
ld.shared.f32 %f43, [global_smem];
|
254 |
+
$L__tmp15:
|
255 |
+
.loc 3 8 15
|
256 |
+
add.f32 %f44, %f43, 0f00000000;
|
257 |
+
$L__tmp16:
|
258 |
+
.loc 1 43 19
|
259 |
+
add.f32 %f45, %f13, %f9;
|
260 |
+
add.f32 %f46, %f14, %f10;
|
261 |
+
add.f32 %f47, %f15, %f11;
|
262 |
+
add.f32 %f48, %f16, %f12;
|
263 |
+
.loc 1 44 20
|
264 |
+
sub.f32 %f49, %f45, %f17;
|
265 |
+
sub.f32 %f50, %f46, %f17;
|
266 |
+
sub.f32 %f51, %f47, %f17;
|
267 |
+
sub.f32 %f52, %f48, %f17;
|
268 |
+
.loc 1 45 20
|
269 |
+
mul.f32 %f53, %f49, %f18;
|
270 |
+
mul.f32 %f54, %f50, %f18;
|
271 |
+
mul.f32 %f55, %f51, %f18;
|
272 |
+
mul.f32 %f56, %f52, %f18;
|
273 |
+
.loc 1 46 19
|
274 |
+
mul.f32 %f57, %f24, %f54;
|
275 |
+
$L__tmp17:
|
276 |
+
.loc 2 243 36
|
277 |
+
bar.sync 0;
|
278 |
+
$L__tmp18:
|
279 |
+
.loc 2 233 15
|
280 |
+
fma.rn.f32 %f58, %f23, %f53, %f57;
|
281 |
+
fma.rn.f32 %f59, %f25, %f55, %f58;
|
282 |
+
fma.rn.f32 %f60, %f26, %f56, %f59;
|
283 |
+
$L__tmp19:
|
284 |
+
.loc 2 243 36
|
285 |
+
mov.b32 %r105, %f60;
|
286 |
+
shfl.sync.bfly.b32 %r106, %r105, 16, 31, -1;
|
287 |
+
mov.b32 %f61, %r106;
|
288 |
+
$L__tmp20:
|
289 |
+
.loc 2 233 15
|
290 |
+
add.f32 %f62, %f60, %f61;
|
291 |
+
$L__tmp21:
|
292 |
+
.loc 2 243 36
|
293 |
+
mov.b32 %r107, %f62;
|
294 |
+
shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1;
|
295 |
+
mov.b32 %f63, %r108;
|
296 |
+
$L__tmp22:
|
297 |
+
.loc 2 233 15
|
298 |
+
add.f32 %f64, %f62, %f63;
|
299 |
+
$L__tmp23:
|
300 |
+
.loc 2 243 36
|
301 |
+
mov.b32 %r109, %f64;
|
302 |
+
shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1;
|
303 |
+
mov.b32 %f65, %r110;
|
304 |
+
$L__tmp24:
|
305 |
+
.loc 2 233 15
|
306 |
+
add.f32 %f66, %f64, %f65;
|
307 |
+
$L__tmp25:
|
308 |
+
.loc 2 243 36
|
309 |
+
mov.b32 %r111, %f66;
|
310 |
+
shfl.sync.bfly.b32 %r112, %r111, 2, 31, -1;
|
311 |
+
mov.b32 %f67, %r112;
|
312 |
+
$L__tmp26:
|
313 |
+
.loc 2 233 15
|
314 |
+
add.f32 %f68, %f66, %f67;
|
315 |
+
$L__tmp27:
|
316 |
+
.loc 2 243 36
|
317 |
+
mov.b32 %r113, %f68;
|
318 |
+
shfl.sync.bfly.b32 %r114, %r113, 1, 31, -1;
|
319 |
+
mov.b32 %f69, %r114;
|
320 |
+
$L__tmp28:
|
321 |
+
.loc 2 233 15
|
322 |
+
add.f32 %f70, %f68, %f69;
|
323 |
+
$L__tmp29:
|
324 |
+
.loc 2 243 36
|
325 |
+
mov.b32 %r57, %f70;
|
326 |
+
@%p30 st.shared.b32 [ %r50 + 0 ], %r57;
|
327 |
+
bar.sync 0;
|
328 |
+
@%p31 ld.shared.b32 %r58, [ %r53 + 0 ];
|
329 |
+
mov.b32 %f71, %r58;
|
330 |
+
shfl.sync.bfly.b32 %r115, %r58, 1, 31, -1;
|
331 |
+
mov.b32 %f72, %r115;
|
332 |
+
$L__tmp30:
|
333 |
+
.loc 2 233 15
|
334 |
+
add.f32 %f73, %f71, %f72;
|
335 |
+
$L__tmp31:
|
336 |
+
.loc 2 243 36
|
337 |
+
mov.b32 %r61, %f73;
|
338 |
+
@%p32 st.shared.b32 [ %r53 + 0 ], %r61;
|
339 |
+
bar.sync 0;
|
340 |
+
ld.shared.f32 %f74, [global_smem];
|
341 |
+
$L__tmp32:
|
342 |
+
.loc 3 8 15
|
343 |
+
add.f32 %f75, %f74, 0f00000000;
|
344 |
+
mov.b32 %r64, 1132462080;
|
345 |
+
$L__tmp33:
|
346 |
+
.loc 1 51 20
|
347 |
+
div.full.f32 %r62, %r63, %r64;
|
348 |
+
mov.b32 %f76, %r62;
|
349 |
+
.loc 1 53 20
|
350 |
+
neg.f32 %f77, %f44;
|
351 |
+
fma.rn.f32 %f78, %f23, 0f43800000, %f77;
|
352 |
+
fma.rn.f32 %f79, %f24, 0f43800000, %f77;
|
353 |
+
fma.rn.f32 %f80, %f25, 0f43800000, %f77;
|
354 |
+
fma.rn.f32 %f81, %f26, 0f43800000, %f77;
|
355 |
+
.loc 1 55 20
|
356 |
+
neg.f32 %f82, %f53;
|
357 |
+
fma.rn.f32 %f83, %f82, %f75, %f78;
|
358 |
+
neg.f32 %f84, %f54;
|
359 |
+
fma.rn.f32 %f85, %f84, %f75, %f79;
|
360 |
+
neg.f32 %f86, %f55;
|
361 |
+
fma.rn.f32 %f87, %f86, %f75, %f80;
|
362 |
+
neg.f32 %f88, %f56;
|
363 |
+
fma.rn.f32 %f89, %f88, %f75, %f81;
|
364 |
+
.loc 1 57 20
|
365 |
+
fma.rn.f32 %f90, %f76, %f83, %f19;
|
366 |
+
fma.rn.f32 %f91, %f76, %f85, %f20;
|
367 |
+
fma.rn.f32 %f92, %f76, %f87, %f21;
|
368 |
+
fma.rn.f32 %f93, %f76, %f89, %f22;
|
369 |
+
.loc 1 59 51
|
370 |
+
mov.b32 %r74, %f90;
|
371 |
+
mov.b32 %r75, %f91;
|
372 |
+
mov.b32 %r76, %f92;
|
373 |
+
mov.b32 %r77, %f93;
|
374 |
+
@%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r74, %r75, %r76, %r77 };
|
375 |
+
.loc 1 60 25
|
376 |
+
add.s64 %rd15, %rd23, %rd24;
|
377 |
+
.loc 1 60 48
|
378 |
+
cvt.rn.bf16.f32 %rs9, %r74;
|
379 |
+
cvt.rn.bf16.f32 %rs10, %r75;
|
380 |
+
cvt.rn.bf16.f32 %rs11, %r76;
|
381 |
+
cvt.rn.bf16.f32 %rs12, %r77;
|
382 |
+
mov.b32 %r116, {%rs9, %rs10};
|
383 |
+
mov.b32 %r117, {%rs11, %rs12};
|
384 |
+
@%p1 st.global.v2.b32 [ %rd15 + 0 ], { %r116, %r117 };
|
385 |
+
.loc 1 60 4
|
386 |
+
ret;
|
387 |
+
$L__tmp34:
|
388 |
+
$L__func_end0:
|
389 |
+
|
390 |
+
}
|
391 |
+
.file 1 "/tmp/torchinductor_root/fh/cfhjzwujbd4bpel57x4hxw7d3m3qqfwrjg6bfe6e4wk2cyh77u45.py"
|
392 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
393 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
394 |
+
.section .debug_abbrev
|
395 |
+
{
|
396 |
+
.b8 1
|
397 |
+
.b8 17
|
398 |
+
.b8 1
|
399 |
+
.b8 37
|
400 |
+
.b8 8
|
401 |
+
.b8 19
|
402 |
+
.b8 5
|
403 |
+
.b8 3
|
404 |
+
.b8 8
|
405 |
+
.b8 16
|
406 |
+
.b8 6
|
407 |
+
.b8 27
|
408 |
+
.b8 8
|
409 |
+
.b8 180
|
410 |
+
.b8 66
|
411 |
+
.b8 12
|
412 |
+
.b8 17
|
413 |
+
.b8 1
|
414 |
+
.b8 18
|
415 |
+
.b8 1
|
416 |
+
.b8 0
|
417 |
+
.b8 0
|
418 |
+
.b8 2
|
419 |
+
.b8 46
|
420 |
+
.b8 0
|
421 |
+
.b8 135
|
422 |
+
.b8 64
|
423 |
+
.b8 8
|
424 |
+
.b8 3
|
425 |
+
.b8 8
|
426 |
+
.b8 58
|
427 |
+
.b8 11
|
428 |
+
.b8 59
|
429 |
+
.b8 11
|
430 |
+
.b8 63
|
431 |
+
.b8 12
|
432 |
+
.b8 32
|
433 |
+
.b8 11
|
434 |
+
.b8 0
|
435 |
+
.b8 0
|
436 |
+
.b8 3
|
437 |
+
.b8 46
|
438 |
+
.b8 1
|
439 |
+
.b8 17
|
440 |
+
.b8 1
|
441 |
+
.b8 18
|
442 |
+
.b8 1
|
443 |
+
.b8 64
|
444 |
+
.b8 10
|
445 |
+
.b8 49
|
446 |
+
.b8 19
|
447 |
+
.b8 0
|
448 |
+
.b8 0
|
449 |
+
.b8 4
|
450 |
+
.b8 29
|
451 |
+
.b8 1
|
452 |
+
.b8 49
|
453 |
+
.b8 19
|
454 |
+
.b8 17
|
455 |
+
.b8 1
|
456 |
+
.b8 18
|
457 |
+
.b8 1
|
458 |
+
.b8 88
|
459 |
+
.b8 11
|
460 |
+
.b8 89
|
461 |
+
.b8 11
|
462 |
+
.b8 87
|
463 |
+
.b8 11
|
464 |
+
.b8 0
|
465 |
+
.b8 0
|
466 |
+
.b8 5
|
467 |
+
.b8 29
|
468 |
+
.b8 0
|
469 |
+
.b8 49
|
470 |
+
.b8 19
|
471 |
+
.b8 17
|
472 |
+
.b8 1
|
473 |
+
.b8 18
|
474 |
+
.b8 1
|
475 |
+
.b8 88
|
476 |
+
.b8 11
|
477 |
+
.b8 89
|
478 |
+
.b8 11
|
479 |
+
.b8 87
|
480 |
+
.b8 11
|
481 |
+
.b8 0
|
482 |
+
.b8 0
|
483 |
+
.b8 0
|
484 |
+
}
|
485 |
+
.section .debug_info
|
486 |
+
{
|
487 |
+
.b32 407
|
488 |
+
.b8 2
|
489 |
+
.b8 0
|
490 |
+
.b32 .debug_abbrev
|
491 |
+
.b8 8
|
492 |
+
.b8 1
|
493 |
+
.b8 116
|
494 |
+
.b8 114
|
495 |
+
.b8 105
|
496 |
+
.b8 116
|
497 |
+
.b8 111
|
498 |
+
.b8 110
|
499 |
+
.b8 0
|
500 |
+
.b8 2
|
501 |
+
.b8 0
|
502 |
+
.b8 99
|
503 |
+
.b8 102
|
504 |
+
.b8 104
|
505 |
+
.b8 106
|
506 |
+
.b8 122
|
507 |
+
.b8 119
|
508 |
+
.b8 117
|
509 |
+
.b8 106
|
510 |
+
.b8 98
|
511 |
+
.b8 100
|
512 |
+
.b8 52
|
513 |
+
.b8 98
|
514 |
+
.b8 112
|
515 |
+
.b8 101
|
516 |
+
.b8 108
|
517 |
+
.b8 53
|
518 |
+
.b8 55
|
519 |
+
.b8 120
|
520 |
+
.b8 52
|
521 |
+
.b8 104
|
522 |
+
.b8 120
|
523 |
+
.b8 119
|
524 |
+
.b8 55
|
525 |
+
.b8 100
|
526 |
+
.b8 51
|
527 |
+
.b8 109
|
528 |
+
.b8 51
|
529 |
+
.b8 113
|
530 |
+
.b8 113
|
531 |
+
.b8 102
|
532 |
+
.b8 119
|
533 |
+
.b8 114
|
534 |
+
.b8 106
|
535 |
+
.b8 103
|
536 |
+
.b8 54
|
537 |
+
.b8 98
|
538 |
+
.b8 102
|
539 |
+
.b8 101
|
540 |
+
.b8 54
|
541 |
+
.b8 101
|
542 |
+
.b8 52
|
543 |
+
.b8 119
|
544 |
+
.b8 107
|
545 |
+
.b8 50
|
546 |
+
.b8 99
|
547 |
+
.b8 121
|
548 |
+
.b8 104
|
549 |
+
.b8 55
|
550 |
+
.b8 55
|
551 |
+
.b8 117
|
552 |
+
.b8 52
|
553 |
+
.b8 53
|
554 |
+
.b8 46
|
555 |
+
.b8 112
|
556 |
+
.b8 121
|
557 |
+
.b8 0
|
558 |
+
.b32 .debug_line
|
559 |
+
.b8 47
|
560 |
+
.b8 116
|
561 |
+
.b8 109
|
562 |
+
.b8 112
|
563 |
+
.b8 47
|
564 |
+
.b8 116
|
565 |
+
.b8 111
|
566 |
+
.b8 114
|
567 |
+
.b8 99
|
568 |
+
.b8 104
|
569 |
+
.b8 105
|
570 |
+
.b8 110
|
571 |
+
.b8 100
|
572 |
+
.b8 117
|
573 |
+
.b8 99
|
574 |
+
.b8 116
|
575 |
+
.b8 111
|
576 |
+
.b8 114
|
577 |
+
.b8 95
|
578 |
+
.b8 114
|
579 |
+
.b8 111
|
580 |
+
.b8 111
|
581 |
+
.b8 116
|
582 |
+
.b8 47
|
583 |
+
.b8 102
|
584 |
+
.b8 104
|
585 |
+
.b8 0
|
586 |
+
.b8 1
|
587 |
+
.b64 $L__func_begin0
|
588 |
+
.b64 $L__func_end0
|
589 |
+
.b8 2
|
590 |
+
.b8 116
|
591 |
+
.b8 114
|
592 |
+
.b8 105
|
593 |
+
.b8 116
|
594 |
+
.b8 111
|
595 |
+
.b8 110
|
596 |
+
.b8 95
|
597 |
+
.b8 95
|
598 |
+
.b8 48
|
599 |
+
.b8 100
|
600 |
+
.b8 49
|
601 |
+
.b8 100
|
602 |
+
.b8 50
|
603 |
+
.b8 100
|
604 |
+
.b8 51
|
605 |
+
.b8 100
|
606 |
+
.b8 52
|
607 |
+
.b8 100
|
608 |
+
.b8 53
|
609 |
+
.b8 100
|
610 |
+
.b8 54
|
611 |
+
.b8 100
|
612 |
+
.b8 55
|
613 |
+
.b8 100
|
614 |
+
.b8 56
|
615 |
+
.b8 100
|
616 |
+
.b8 101
|
617 |
+
.b8 57
|
618 |
+
.b8 100
|
619 |
+
.b8 101
|
620 |
+
.b8 0
|
621 |
+
.b8 116
|
622 |
+
.b8 114
|
623 |
+
.b8 105
|
624 |
+
.b8 116
|
625 |
+
.b8 111
|
626 |
+
.b8 110
|
627 |
+
.b8 95
|
628 |
+
.b8 95
|
629 |
+
.b8 48
|
630 |
+
.b8 100
|
631 |
+
.b8 49
|
632 |
+
.b8 100
|
633 |
+
.b8 50
|
634 |
+
.b8 100
|
635 |
+
.b8 51
|
636 |
+
.b8 100
|
637 |
+
.b8 52
|
638 |
+
.b8 100
|
639 |
+
.b8 53
|
640 |
+
.b8 100
|
641 |
+
.b8 54
|
642 |
+
.b8 100
|
643 |
+
.b8 55
|
644 |
+
.b8 100
|
645 |
+
.b8 56
|
646 |
+
.b8 100
|
647 |
+
.b8 101
|
648 |
+
.b8 57
|
649 |
+
.b8 100
|
650 |
+
.b8 101
|
651 |
+
.b8 0
|
652 |
+
.b8 1
|
653 |
+
.b8 18
|
654 |
+
.b8 1
|
655 |
+
.b8 1
|
656 |
+
.b8 3
|
657 |
+
.b64 $L__func_begin0
|
658 |
+
.b64 $L__func_end0
|
659 |
+
.b8 1
|
660 |
+
.b8 156
|
661 |
+
.b32 125
|
662 |
+
.b8 4
|
663 |
+
.b32 125
|
664 |
+
.b64 $L__tmp1
|
665 |
+
.b64 $L__tmp14
|
666 |
+
.b8 2
|
667 |
+
.b8 41
|
668 |
+
.b8 57
|
669 |
+
.b8 5
|
670 |
+
.b32 125
|
671 |
+
.b64 $L__tmp1
|
672 |
+
.b64 $L__tmp14
|
673 |
+
.b8 2
|
674 |
+
.b8 243
|
675 |
+
.b8 36
|
676 |
+
.b8 0
|
677 |
+
.b8 5
|
678 |
+
.b32 125
|
679 |
+
.b64 $L__tmp2
|
680 |
+
.b64 $L__tmp15
|
681 |
+
.b8 2
|
682 |
+
.b8 41
|
683 |
+
.b8 57
|
684 |
+
.b8 5
|
685 |
+
.b32 125
|
686 |
+
.b64 $L__tmp15
|
687 |
+
.b64 $L__tmp16
|
688 |
+
.b8 3
|
689 |
+
.b8 41
|
690 |
+
.b8 44
|
691 |
+
.b8 5
|
692 |
+
.b32 125
|
693 |
+
.b64 $L__tmp17
|
694 |
+
.b64 $L__tmp32
|
695 |
+
.b8 2
|
696 |
+
.b8 49
|
697 |
+
.b8 59
|
698 |
+
.b8 4
|
699 |
+
.b32 125
|
700 |
+
.b64 $L__tmp18
|
701 |
+
.b64 $L__tmp31
|
702 |
+
.b8 2
|
703 |
+
.b8 49
|
704 |
+
.b8 59
|
705 |
+
.b8 5
|
706 |
+
.b32 125
|
707 |
+
.b64 $L__tmp18
|
708 |
+
.b64 $L__tmp31
|
709 |
+
.b8 2
|
710 |
+
.b8 243
|
711 |
+
.b8 36
|
712 |
+
.b8 0
|
713 |
+
.b8 5
|
714 |
+
.b32 125
|
715 |
+
.b64 $L__tmp32
|
716 |
+
.b64 $L__tmp33
|
717 |
+
.b8 3
|
718 |
+
.b8 49
|
719 |
+
.b8 45
|
720 |
+
.b8 0
|
721 |
+
.b8 0
|
722 |
+
}
|
723 |
+
.section .debug_pubnames
|
724 |
+
{
|
725 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
726 |
+
$L__pubNames_start0:
|
727 |
+
.b8 2
|
728 |
+
.b8 0
|
729 |
+
.b32 .debug_info
|
730 |
+
.b32 411
|
731 |
+
.b32 125
|
732 |
+
.b8 116
|
733 |
+
.b8 114
|
734 |
+
.b8 105
|
735 |
+
.b8 116
|
736 |
+
.b8 111
|
737 |
+
.b8 110
|
738 |
+
.b8 95
|
739 |
+
.b8 95
|
740 |
+
.b8 48
|
741 |
+
.b8 100
|
742 |
+
.b8 49
|
743 |
+
.b8 100
|
744 |
+
.b8 50
|
745 |
+
.b8 100
|
746 |
+
.b8 51
|
747 |
+
.b8 100
|
748 |
+
.b8 52
|
749 |
+
.b8 100
|
750 |
+
.b8 53
|
751 |
+
.b8 100
|
752 |
+
.b8 54
|
753 |
+
.b8 100
|
754 |
+
.b8 55
|
755 |
+
.b8 100
|
756 |
+
.b8 56
|
757 |
+
.b8 100
|
758 |
+
.b8 101
|
759 |
+
.b8 57
|
760 |
+
.b8 100
|
761 |
+
.b8 101
|
762 |
+
.b8 0
|
763 |
+
.b32 0
|
764 |
+
$L__pubNames_end0:
|
765 |
+
}
|
766 |
+
.section .debug_pubtypes
|
767 |
+
{
|
768 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
769 |
+
$L__pubTypes_start0:
|
770 |
+
.b8 2
|
771 |
+
.b8 0
|
772 |
+
.b32 .debug_info
|
773 |
+
.b32 411
|
774 |
+
.b32 0
|
775 |
+
$L__pubTypes_end0:
|
776 |
+
}
|
777 |
+
.section .debug_loc { }
|
.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
7 |
+
%c256_i32 = arith.constant 256 : i32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
20 |
+
%9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
21 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
22 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
27 |
+
%16 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
28 |
+
%17 = tt.addptr %16, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
29 |
+
%18 = tt.load %17, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
30 |
+
%19 = arith.extf %18 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
31 |
+
%20 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
32 |
+
%21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
34 |
+
%23 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
|
35 |
+
%24 = tt.splat %23 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
36 |
+
%25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
37 |
+
%26 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
38 |
+
%27 = tt.addptr %26, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
39 |
+
%28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
40 |
+
%29 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
|
41 |
+
%30 = arith.select %2, %29, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
42 |
+
%31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
|
43 |
+
^bb0(%arg10: f32, %arg11: f32):
|
44 |
+
%55 = arith.addf %arg10, %arg11 : f32
|
45 |
+
tt.reduce.return %55 : f32
|
46 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
47 |
+
%32 = arith.addf %31, %cst_1 : f32
|
48 |
+
%33 = arith.addf %15, %19 : tensor<256xf32, #blocked>
|
49 |
+
%34 = tt.broadcast %22 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
50 |
+
%35 = arith.subf %33, %34 : tensor<256xf32, #blocked>
|
51 |
+
%36 = tt.broadcast %25 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
52 |
+
%37 = arith.mulf %35, %36 : tensor<256xf32, #blocked>
|
53 |
+
%38 = arith.mulf %29, %37 : tensor<256xf32, #blocked>
|
54 |
+
%39 = arith.select %2, %38, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
55 |
+
%40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
|
56 |
+
^bb0(%arg10: f32, %arg11: f32):
|
57 |
+
%55 = arith.addf %arg10, %arg11 : f32
|
58 |
+
tt.reduce.return %55 : f32
|
59 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
60 |
+
%41 = arith.addf %40, %cst_1 : f32
|
61 |
+
%42 = arith.divf %25, %cst_0 : tensor<1xf32, #blocked>
|
62 |
+
%43 = arith.mulf %29, %cst_3 : tensor<256xf32, #blocked>
|
63 |
+
%44 = tt.splat %32 : (f32) -> tensor<256xf32, #blocked>
|
64 |
+
%45 = arith.subf %43, %44 : tensor<256xf32, #blocked>
|
65 |
+
%46 = tt.splat %41 : (f32) -> tensor<256xf32, #blocked>
|
66 |
+
%47 = arith.mulf %37, %46 : tensor<256xf32, #blocked>
|
67 |
+
%48 = arith.subf %45, %47 : tensor<256xf32, #blocked>
|
68 |
+
%49 = tt.broadcast %42 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
69 |
+
%50 = arith.mulf %49, %48 : tensor<256xf32, #blocked>
|
70 |
+
%51 = arith.addf %28, %50 : tensor<256xf32, #blocked>
|
71 |
+
tt.store %27, %51, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
72 |
+
%52 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
73 |
+
%53 = tt.addptr %52, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
74 |
+
%54 = arith.truncf %51 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
75 |
+
tt.store %53, %54, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
76 |
+
tt.return
|
77 |
+
}
|
78 |
+
}
|
.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
|
9 |
+
%cst_3 = arith.constant 9.99999974E-6 : f32
|
10 |
+
%cst_4 = arith.constant 2.560000e+02 : f32
|
11 |
+
%cst_5 = arith.constant 0.000000e+00 : f32
|
12 |
+
%c256_i32 = arith.constant 256 : i32
|
13 |
+
%c512_i32 = arith.constant 512 : i32
|
14 |
+
%cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1>
|
15 |
+
%cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1>
|
16 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
17 |
+
%cst_9 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
18 |
+
%0 = tt.get_program_id x : i32
|
19 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
20 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
21 |
+
%3 = arith.remsi %0, %c512_i32 : i32
|
22 |
+
%4 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32
|
23 |
+
%5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
|
24 |
+
%6 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked1>
|
25 |
+
%7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
|
26 |
+
%8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1>
|
27 |
+
%9 = arith.muli %3, %c256_i32 : i32
|
28 |
+
%10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked>
|
29 |
+
%11 = arith.addi %1, %10 : tensor<256xi32, #blocked>
|
30 |
+
%12 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
31 |
+
%13 = tt.addptr %12, %11 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
32 |
+
%14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
33 |
+
%15 = arith.muli %0, %c256_i32 : i32
|
34 |
+
%16 = tt.splat %15 : (i32) -> tensor<256xi32, #blocked>
|
35 |
+
%17 = arith.addi %1, %16 : tensor<256xi32, #blocked>
|
36 |
+
%18 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
37 |
+
%19 = tt.addptr %18, %17 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
38 |
+
%20 = tt.load %19, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
39 |
+
%21 = arith.extf %20 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
40 |
+
%22 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
41 |
+
%23 = tt.addptr %22, %17 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
42 |
+
%24 = tt.load %23, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
43 |
+
%25 = arith.extf %24 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
44 |
+
%26 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
45 |
+
%27 = tt.addptr %26, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
46 |
+
%28 = tt.load %27, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
47 |
+
%29 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked>
|
48 |
+
%30 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1>
|
49 |
+
%31 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked>
|
50 |
+
%32 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1>
|
51 |
+
%33 = arith.select %31, %29, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
|
52 |
+
%34 = arith.select %32, %30, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1>
|
53 |
+
%35 = arith.cmpi sge, %34, %cst_7 : tensor<1xi64, #blocked1>
|
54 |
+
%36 = arith.cmpi slt, %34, %cst_6 : tensor<1xi64, #blocked1>
|
55 |
+
%37 = arith.andi %35, %36 : tensor<1xi1, #blocked1>
|
56 |
+
tt.assert %37, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1>
|
57 |
+
%38 = arith.muli %33, %cst_2 : tensor<1xi64, #blocked>
|
58 |
+
%39 = tt.broadcast %38 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
|
59 |
+
%40 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
|
60 |
+
%41 = arith.addi %40, %39 : tensor<256xi64, #blocked>
|
61 |
+
%42 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
62 |
+
%43 = tt.addptr %42, %41 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
|
63 |
+
%44 = tt.load %43, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
64 |
+
%45 = arith.addf %44, %14 : tensor<256xf32, #blocked>
|
65 |
+
%46 = arith.addf %45, %21 : tensor<256xf32, #blocked>
|
66 |
+
%47 = arith.addf %46, %25 : tensor<256xf32, #blocked>
|
67 |
+
%48 = arith.select %2, %47, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
68 |
+
%49 = "tt.reduce"(%48) <{axis = 0 : i32}> ({
|
69 |
+
^bb0(%arg10: f32, %arg11: f32):
|
70 |
+
%69 = arith.addf %arg10, %arg11 : f32
|
71 |
+
tt.reduce.return %69 : f32
|
72 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
73 |
+
%50 = arith.addf %49, %cst_5 : f32
|
74 |
+
%51 = arith.divf %50, %cst_4 : f32
|
75 |
+
%52 = tt.splat %51 : (f32) -> tensor<256xf32, #blocked>
|
76 |
+
%53 = arith.subf %47, %52 : tensor<256xf32, #blocked>
|
77 |
+
%54 = arith.mulf %53, %53 : tensor<256xf32, #blocked>
|
78 |
+
%55 = arith.select %2, %54, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
79 |
+
%56 = "tt.reduce"(%55) <{axis = 0 : i32}> ({
|
80 |
+
^bb0(%arg10: f32, %arg11: f32):
|
81 |
+
%69 = arith.addf %arg10, %arg11 : f32
|
82 |
+
tt.reduce.return %69 : f32
|
83 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
84 |
+
%57 = arith.addf %56, %cst_5 : f32
|
85 |
+
%58 = arith.divf %57, %cst_4 : f32
|
86 |
+
%59 = arith.addf %58, %cst_3 : f32
|
87 |
+
%60 = tt.extern_elementwise %59 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
88 |
+
%61 = tt.splat %60 : (f32) -> tensor<256xf32, #blocked>
|
89 |
+
%62 = arith.mulf %53, %61 : tensor<256xf32, #blocked>
|
90 |
+
%63 = arith.mulf %62, %28 : tensor<256xf32, #blocked>
|
91 |
+
%64 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
92 |
+
%65 = tt.addptr %64, %17 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
93 |
+
tt.store %65, %47, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
94 |
+
%66 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
95 |
+
%67 = tt.addptr %66, %17 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
96 |
+
%68 = arith.truncf %63 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
97 |
+
tt.store %67, %68, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
98 |
+
tt.return
|
99 |
+
}
|
100 |
+
}
|
.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
|
5 |
+
%c512_i32 = arith.constant 512 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
11 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
12 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
|
13 |
+
tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
|
14 |
+
tt.return
|
15 |
+
}
|
16 |
+
}
|