0-hero commited on
Commit
8c1fe04
·
verified ·
1 Parent(s): 9ab9a5e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir +366 -0
  2. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx +807 -0
  3. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir +76 -0
  4. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir +57 -0
  5. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin +0 -0
  6. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx +764 -0
  7. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir +26 -0
  8. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir +25 -0
  9. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin +0 -0
  10. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir +290 -0
  11. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx +653 -0
  12. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir +60 -0
  13. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir +53 -0
  14. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir +24 -0
  15. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin +0 -0
  16. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir +63 -0
  17. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx +577 -0
  18. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir +65 -0
  19. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin +0 -0
  20. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx +743 -0
  21. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir +72 -0
  22. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin +0 -0
  23. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir +132 -0
  24. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir +304 -0
  25. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir +62 -0
  26. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir +61 -0
  27. .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir +213 -0
  28. .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx +495 -0
  29. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx +486 -0
  30. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir +38 -0
  31. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir +53 -0
  32. .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin +0 -0
  33. .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx +709 -0
  34. .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir +66 -0
  35. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin +0 -0
  36. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir +15 -0
  37. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin +0 -0
  38. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir +38 -0
  39. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir +37 -0
  40. .triton/dump/962d1809855a53123762906133b1d960/triton_.ttir +17 -0
  41. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir +368 -0
  42. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir +127 -0
  43. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir +333 -0
  44. .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin +0 -0
  45. .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir +230 -0
  46. .triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx +296 -0
  47. .triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx +777 -0
  48. .triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir +78 -0
  49. .triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir +100 -0
  50. .triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir +16 -0
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 {
8
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %12 = and i32 %11, 31, !dbg !10
10
+ %13 = lshr i32 %11, 5, !dbg !10
11
+ %14 = and i32 %13, 1, !dbg !10
12
+ %urem = shl i32 %11, 2, !dbg !10
13
+ %15 = and i32 %urem, 252, !dbg !10
14
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %17 = shl i32 %16, 8, !dbg !12
16
+ %18 = or i32 %17, %15, !dbg !13
17
+ %19 = sext i32 %18 to i64, !dbg !14
18
+ %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14
19
+ %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15
21
+ %23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15
22
+ %24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15
23
+ %25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15
24
+ %26 = bitcast i32 %22 to float, !dbg !15
25
+ %27 = bitcast i32 %23 to float, !dbg !15
26
+ %28 = bitcast i32 %24 to float, !dbg !15
27
+ %29 = bitcast i32 %25 to float, !dbg !15
28
+ %30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16
29
+ %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
30
+ %32 = extractvalue { i32, i32 } %31, 0, !dbg !17
31
+ %33 = extractvalue { i32, i32 } %31, 1, !dbg !17
32
+ %34 = trunc i32 %32 to i16, !dbg !17
33
+ %extelt.offset = lshr i32 %32, 16, !dbg !17
34
+ %35 = trunc i32 %extelt.offset to i16, !dbg !17
35
+ %36 = trunc i32 %33 to i16, !dbg !17
36
+ %extelt.offset1 = lshr i32 %33, 16, !dbg !17
37
+ %37 = trunc i32 %extelt.offset1 to i16, !dbg !17
38
+ %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
39
+ %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18
40
+ %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18
41
+ %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18
42
+ %42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19
43
+ %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
44
+ %44 = extractvalue { i32, i32 } %43, 0, !dbg !20
45
+ %45 = extractvalue { i32, i32 } %43, 1, !dbg !20
46
+ %46 = trunc i32 %44 to i16, !dbg !20
47
+ %extelt.offset2 = lshr i32 %44, 16, !dbg !20
48
+ %47 = trunc i32 %extelt.offset2 to i16, !dbg !20
49
+ %48 = trunc i32 %45 to i16, !dbg !20
50
+ %extelt.offset3 = lshr i32 %45, 16, !dbg !20
51
+ %49 = trunc i32 %extelt.offset3 to i16, !dbg !20
52
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21
53
+ %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21
54
+ %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21
55
+ %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21
56
+ %54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22
57
+ %55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
58
+ %56 = extractvalue { i32, i32 } %55, 0, !dbg !23
59
+ %57 = extractvalue { i32, i32 } %55, 1, !dbg !23
60
+ %58 = trunc i32 %56 to i16, !dbg !23
61
+ %extelt.offset4 = lshr i32 %56, 16, !dbg !23
62
+ %59 = trunc i32 %extelt.offset4 to i16, !dbg !23
63
+ %60 = trunc i32 %57 to i16, !dbg !23
64
+ %extelt.offset5 = lshr i32 %57, 16, !dbg !23
65
+ %61 = trunc i32 %extelt.offset5 to i16, !dbg !23
66
+ %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24
67
+ %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24
68
+ %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24
69
+ %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24
70
+ %66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25
71
+ %67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
72
+ %68 = extractvalue { i32, i32 } %67, 0, !dbg !26
73
+ %69 = extractvalue { i32, i32 } %67, 1, !dbg !26
74
+ %70 = trunc i32 %68 to i16, !dbg !26
75
+ %extelt.offset6 = lshr i32 %68, 16, !dbg !26
76
+ %71 = trunc i32 %extelt.offset6 to i16, !dbg !26
77
+ %72 = trunc i32 %69 to i16, !dbg !26
78
+ %extelt.offset7 = lshr i32 %69, 16, !dbg !26
79
+ %73 = trunc i32 %extelt.offset7 to i16, !dbg !26
80
+ %74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27
81
+ %75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27
82
+ %76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27
83
+ %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27
84
+ %78 = zext nneg i32 %15 to i64, !dbg !28
85
+ %79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28
86
+ %80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
87
+ %81 = fadd float %38, %26, !dbg !30
88
+ %82 = fadd float %39, %27, !dbg !30
89
+ %83 = fadd float %40, %28, !dbg !30
90
+ %84 = fadd float %81, %50, !dbg !31
91
+ %85 = fadd float %82, %51, !dbg !31
92
+ %86 = fadd float %83, %52, !dbg !31
93
+ %87 = fadd float %85, %63, !dbg !32
94
+ %88 = fadd float %86, %64, !dbg !32
95
+ %89 = fadd float %87, %75, !dbg !33
96
+ %90 = fadd float %88, %76, !dbg !33
97
+ %91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32
98
+ %92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32
99
+ %93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32
100
+ %94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32
101
+ %95 = fadd <2 x float> %92, %94, !dbg !32
102
+ %96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33
103
+ %97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33
104
+ %98 = fadd <2 x float> %95, %97, !dbg !33
105
+ %99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34
106
+ %100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34
107
+ %101 = fadd <2 x float> %98, %100, !dbg !34
108
+ %102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34
109
+ %103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34
110
+ %104 = fadd <2 x float> %101, %103, !dbg !34
111
+ %105 = extractelement <2 x float> %104, i64 0, !dbg !34
112
+ %106 = extractelement <2 x float> %104, i64 1, !dbg !34
113
+ %107 = fadd float %105, %106, !dbg !34
114
+ %108 = bitcast float %107 to i32, !dbg !40
115
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40
116
+ %110 = bitcast i32 %109 to float, !dbg !40
117
+ %111 = fadd float %107, %110, !dbg !34
118
+ %112 = bitcast float %111 to i32, !dbg !40
119
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40
120
+ %114 = bitcast i32 %113 to float, !dbg !40
121
+ %115 = fadd float %111, %114, !dbg !34
122
+ %116 = bitcast float %115 to i32, !dbg !40
123
+ %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40
124
+ %118 = bitcast i32 %117 to float, !dbg !40
125
+ %119 = fadd float %115, %118, !dbg !34
126
+ %120 = bitcast float %119 to i32, !dbg !40
127
+ %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40
128
+ %122 = bitcast i32 %121 to float, !dbg !40
129
+ %123 = fadd float %119, %122, !dbg !34
130
+ %124 = bitcast float %123 to i32, !dbg !40
131
+ %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40
132
+ %126 = bitcast i32 %125 to float, !dbg !40
133
+ %127 = fadd float %123, %126, !dbg !34
134
+ %128 = icmp eq i32 %12, 0, !dbg !40
135
+ %129 = zext nneg i32 %14 to i64, !dbg !40
136
+ %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40
137
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40
138
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
139
+ %131 = icmp slt i32 %11, 2, !dbg !40
140
+ %132 = sext i32 %11 to i64, !dbg !40
141
+ %133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40
142
+ %134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40
143
+ %135 = bitcast float %134 to i32, !dbg !40
144
+ %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40
145
+ %137 = bitcast i32 %136 to float, !dbg !40
146
+ %138 = fadd float %134, %137, !dbg !34
147
+ %139 = and i32 %11, 1, !dbg !40
148
+ %140 = icmp eq i32 %139, 0, !dbg !40
149
+ %141 = and i1 %131, %140, !dbg !40
150
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40
151
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
152
+ %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40
153
+ %143 = fadd float %142, 0.000000e+00, !dbg !42
154
+ %144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46
155
+ %145 = extractelement <2 x float> %98, i64 0, !dbg !47
156
+ %146 = fsub float %145, %144, !dbg !47
157
+ %147 = fsub float %89, %144, !dbg !47
158
+ %148 = fsub float %90, %144, !dbg !47
159
+ %149 = fsub float %106, %144, !dbg !47
160
+ %150 = fmul float %146, %146, !dbg !48
161
+ %151 = fmul float %147, %147, !dbg !48
162
+ %152 = fmul float %148, %148, !dbg !48
163
+ %153 = fmul float %149, %149, !dbg !48
164
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
165
+ %154 = fadd float %150, %151, !dbg !51
166
+ %155 = fadd float %152, %154, !dbg !51
167
+ %156 = fadd float %153, %155, !dbg !51
168
+ %157 = bitcast float %156 to i32, !dbg !49
169
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49
170
+ %159 = bitcast i32 %158 to float, !dbg !49
171
+ %160 = fadd float %156, %159, !dbg !51
172
+ %161 = bitcast float %160 to i32, !dbg !49
173
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49
174
+ %163 = bitcast i32 %162 to float, !dbg !49
175
+ %164 = fadd float %160, %163, !dbg !51
176
+ %165 = bitcast float %164 to i32, !dbg !49
177
+ %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49
178
+ %167 = bitcast i32 %166 to float, !dbg !49
179
+ %168 = fadd float %164, %167, !dbg !51
180
+ %169 = bitcast float %168 to i32, !dbg !49
181
+ %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49
182
+ %171 = bitcast i32 %170 to float, !dbg !49
183
+ %172 = fadd float %168, %171, !dbg !51
184
+ %173 = bitcast float %172 to i32, !dbg !49
185
+ %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49
186
+ %175 = bitcast i32 %174 to float, !dbg !49
187
+ %176 = fadd float %172, %175, !dbg !51
188
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49
189
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
190
+ %177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49
191
+ %178 = bitcast float %177 to i32, !dbg !49
192
+ %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49
193
+ %180 = bitcast i32 %179 to float, !dbg !49
194
+ %181 = fadd float %177, %180, !dbg !51
195
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49
196
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
197
+ %182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49
198
+ %183 = fadd float %182, 0.000000e+00, !dbg !54
199
+ %184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56
200
+ %185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57
201
+ %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58
202
+ %.not.i = icmp eq i32 %186, 0, !dbg !58
203
+ br i1 %.not.i, label %189, label %187, !dbg !58
204
+
205
+ 187: ; preds = %10
206
+ %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58
207
+ br label %__nv_rsqrtf.exit, !dbg !58
208
+
209
+ 189: ; preds = %10
210
+ %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58
211
+ br label %__nv_rsqrtf.exit, !dbg !58
212
+
213
+ __nv_rsqrtf.exit: ; preds = %187, %189
214
+ %.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58
215
+ %191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29
216
+ %192 = bitcast i32 %191 to float, !dbg !29
217
+ %193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29
218
+ %194 = bitcast i32 %193 to float, !dbg !29
219
+ %195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29
220
+ %196 = bitcast i32 %195 to float, !dbg !29
221
+ %197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29
222
+ %198 = bitcast i32 %197 to float, !dbg !29
223
+ %199 = fmul float %146, %.0.i, !dbg !59
224
+ %200 = fmul float %147, %.0.i, !dbg !59
225
+ %201 = fmul float %148, %.0.i, !dbg !59
226
+ %202 = fmul float %149, %.0.i, !dbg !59
227
+ %203 = fmul float %199, %198, !dbg !60
228
+ %204 = fmul float %200, %196, !dbg !60
229
+ %205 = fmul float %201, %194, !dbg !60
230
+ %206 = fmul float %202, %192, !dbg !60
231
+ %207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61
232
+ %208 = bitcast float %145 to i32, !dbg !62
233
+ %209 = bitcast float %89 to i32, !dbg !62
234
+ %210 = bitcast float %90 to i32, !dbg !62
235
+ %211 = bitcast float %106 to i32, !dbg !62
236
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62
237
+ %212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63
238
+ %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64
239
+ %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64
240
+ %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64
241
+ %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64
242
+ %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64
243
+ %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64
244
+ %219 = bitcast <2 x i16> %218 to i32, !dbg !64
245
+ %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64
246
+ %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64
247
+ %222 = bitcast <2 x i16> %221 to i32, !dbg !64
248
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64
249
+ ret void, !dbg !65
250
+ }
251
+
252
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
253
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
254
+
255
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
256
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
257
+
258
+ ; Function Attrs: convergent nocallback nounwind
259
+ declare void @llvm.nvvm.barrier0() #2
260
+
261
+ ; Function Attrs: alwaysinline nounwind
262
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
263
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
264
+ %.not = icmp eq i32 %1, 0
265
+ br i1 %.not, label %4, label %2
266
+
267
+ 2: ; preds = %0
268
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
269
+ br label %6
270
+
271
+ 4: ; preds = %0
272
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
273
+ br label %6
274
+
275
+ 6: ; preds = %4, %2
276
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
277
+ ret float %.0
278
+ }
279
+
280
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
281
+
282
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
283
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
284
+
285
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
286
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
287
+
288
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
289
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
290
+ attributes #2 = { convergent nocallback nounwind }
291
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
292
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
293
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
294
+ attributes #6 = { nounwind }
295
+
296
+ !llvm.module.flags = !{!0, !1}
297
+ !llvm.dbg.cu = !{!2}
298
+ !nvvm.annotations = !{!4, !5, !5, !4}
299
+ !llvm.ident = !{!6}
300
+
301
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
302
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
303
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
304
+ !3 = !DIFile(filename: "cybxnh26qvsbmxmvdr54vaav2ezk2qxu7562fhhsn4lvyvqgoglw.py", directory: "/tmp/torchinductor_root/yb")
305
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
306
+ !5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
307
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
308
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
309
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
310
+ !9 = !{}
311
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
312
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
313
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
314
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
315
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
316
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
317
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
318
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
319
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
320
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
321
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
322
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
323
+ !22 = !DILocation(line: 33, column: 30, scope: !7)
324
+ !23 = !DILocation(line: 33, column: 46, scope: !7)
325
+ !24 = !DILocation(line: 33, column: 67, scope: !7)
326
+ !25 = !DILocation(line: 34, column: 31, scope: !7)
327
+ !26 = !DILocation(line: 34, column: 47, scope: !7)
328
+ !27 = !DILocation(line: 34, column: 68, scope: !7)
329
+ !28 = !DILocation(line: 35, column: 31, scope: !7)
330
+ !29 = !DILocation(line: 35, column: 36, scope: !7)
331
+ !30 = !DILocation(line: 37, column: 18, scope: !7)
332
+ !31 = !DILocation(line: 39, column: 18, scope: !7)
333
+ !32 = !DILocation(line: 41, column: 18, scope: !7)
334
+ !33 = !DILocation(line: 43, column: 19, scope: !7)
335
+ !34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38)
336
+ !35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0)
337
+ !36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
338
+ !37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
339
+ !38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39)
340
+ !39 = !DILocation(line: 48, column: 59, scope: !35)
341
+ !40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41)
342
+ !41 = !DILocation(line: 48, column: 59, scope: !37)
343
+ !42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45)
344
+ !43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0)
345
+ !44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
346
+ !45 = !DILocation(line: 48, column: 45, scope: !43)
347
+ !46 = !DILocation(line: 51, column: 20, scope: !7)
348
+ !47 = !DILocation(line: 52, column: 20, scope: !7)
349
+ !48 = !DILocation(line: 53, column: 20, scope: !7)
350
+ !49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50)
351
+ !50 = !DILocation(line: 56, column: 59, scope: !37)
352
+ !51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52)
353
+ !52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53)
354
+ !53 = !DILocation(line: 56, column: 59, scope: !35)
355
+ !54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55)
356
+ !55 = !DILocation(line: 56, column: 45, scope: !43)
357
+ !56 = !DILocation(line: 59, column: 20, scope: !7)
358
+ !57 = !DILocation(line: 61, column: 20, scope: !7)
359
+ !58 = !DILocation(line: 62, column: 26, scope: !7)
360
+ !59 = !DILocation(line: 63, column: 20, scope: !7)
361
+ !60 = !DILocation(line: 64, column: 20, scope: !7)
362
+ !61 = !DILocation(line: 66, column: 25, scope: !7)
363
+ !62 = !DILocation(line: 66, column: 48, scope: !7)
364
+ !63 = !DILocation(line: 67, column: 25, scope: !7)
365
+ !64 = !DILocation(line: 67, column: 48, scope: !7)
366
+ !65 = !DILocation(line: 67, column: 4, scope: !7)
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx ADDED
@@ -0,0 +1,807 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8de9de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
22
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
23
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
24
+ )
25
+ .maxntid 64, 1, 1
26
+ {
27
+ .reg .pred %p<33>;
28
+ .reg .b16 %rs<21>;
29
+ .reg .b32 %r<112>;
30
+ .reg .f32 %f<94>;
31
+ .reg .b64 %rd<20>;
32
+ .loc 1 18 0
33
+ $L__func_begin0:
34
+ .loc 1 18 0
35
+
36
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
37
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
38
+ $L__tmp0:
39
+ .loc 1 26 26
40
+ mov.u32 %r78, %tid.x;
41
+ and.b32 %r79, %r78, 31;
42
+ ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
43
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
44
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
45
+ shl.b32 %r80, %r78, 2;
46
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
47
+ and.b32 %r81, %r80, 252;
48
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
49
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
50
+ .loc 1 23 28
51
+ mov.u32 %r1, %ctaid.x;
52
+ .loc 1 30 40
53
+ shl.b32 %r82, %r1, 8;
54
+ .loc 1 30 36
55
+ or.b32 %r83, %r82, %r81;
56
+ .loc 1 30 30
57
+ mul.wide.s32 %rd17, %r83, 4;
58
+ add.s64 %rd1, %rd9, %rd17;
59
+ mov.b32 %r6, 0;
60
+ mov.pred %p1, -1;
61
+ .loc 1 30 46
62
+ mov.u32 %r2, 0x0;
63
+ mov.u32 %r3, 0x0;
64
+ mov.u32 %r4, 0x0;
65
+ mov.u32 %r5, 0x0;
66
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
67
+ @!%p1 mov.u32 %r2, %r6;
68
+ @!%p1 mov.u32 %r3, %r6;
69
+ @!%p1 mov.u32 %r4, %r6;
70
+ @!%p1 mov.u32 %r5, %r6;
71
+ mov.b32 %f1, %r2;
72
+ mov.b32 %f2, %r3;
73
+ mov.b32 %f3, %r4;
74
+ mov.b32 %f4, %r5;
75
+ .loc 1 31 30
76
+ mul.wide.s32 %rd18, %r83, 2;
77
+ add.s64 %rd2, %rd10, %rd18;
78
+ .loc 1 31 46
79
+ mov.u32 %r10, 0x0;
80
+ mov.u32 %r11, 0x0;
81
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
82
+ @!%p1 mov.u32 %r10, %r6;
83
+ @!%p1 mov.u32 %r11, %r6;
84
+ cvt.u16.u32 %rs1, %r10;
85
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
86
+ cvt.u16.u32 %rs3, %r11;
87
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
88
+ .loc 1 31 67
89
+ cvt.f32.bf16 %r14, %rs1;
90
+ mov.b32 %f5, %r14;
91
+ cvt.f32.bf16 %r15, %rs2;
92
+ mov.b32 %f6, %r15;
93
+ cvt.f32.bf16 %r16, %rs3;
94
+ mov.b32 %f7, %r16;
95
+ cvt.f32.bf16 %r17, %rs4;
96
+ mov.b32 %f8, %r17;
97
+ .loc 1 32 30
98
+ add.s64 %rd3, %rd11, %rd18;
99
+ .loc 1 32 46
100
+ mov.u32 %r18, 0x0;
101
+ mov.u32 %r19, 0x0;
102
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
103
+ @!%p1 mov.u32 %r18, %r6;
104
+ @!%p1 mov.u32 %r19, %r6;
105
+ cvt.u16.u32 %rs5, %r18;
106
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
107
+ cvt.u16.u32 %rs7, %r19;
108
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
109
+ .loc 1 32 67
110
+ cvt.f32.bf16 %r22, %rs5;
111
+ mov.b32 %f9, %r22;
112
+ cvt.f32.bf16 %r23, %rs6;
113
+ mov.b32 %f10, %r23;
114
+ cvt.f32.bf16 %r24, %rs7;
115
+ mov.b32 %f11, %r24;
116
+ cvt.f32.bf16 %r25, %rs8;
117
+ mov.b32 %f12, %r25;
118
+ .loc 1 33 30
119
+ add.s64 %rd4, %rd12, %rd18;
120
+ .loc 1 33 46
121
+ mov.u32 %r26, 0x0;
122
+ mov.u32 %r27, 0x0;
123
+ @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
124
+ @!%p1 mov.u32 %r26, %r6;
125
+ @!%p1 mov.u32 %r27, %r6;
126
+ cvt.u16.u32 %rs9, %r26;
127
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
128
+ cvt.u16.u32 %rs11, %r27;
129
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
130
+ .loc 1 33 67
131
+ cvt.f32.bf16 %r30, %rs9;
132
+ mov.b32 %f13, %r30;
133
+ cvt.f32.bf16 %r31, %rs10;
134
+ mov.b32 %f14, %r31;
135
+ cvt.f32.bf16 %r32, %rs11;
136
+ mov.b32 %f15, %r32;
137
+ cvt.f32.bf16 %r33, %rs12;
138
+ mov.b32 %f16, %r33;
139
+ .loc 1 34 31
140
+ add.s64 %rd5, %rd13, %rd18;
141
+ .loc 1 34 47
142
+ mov.u32 %r34, 0x0;
143
+ mov.u32 %r35, 0x0;
144
+ @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
145
+ @!%p1 mov.u32 %r34, %r6;
146
+ @!%p1 mov.u32 %r35, %r6;
147
+ cvt.u16.u32 %rs13, %r34;
148
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
149
+ cvt.u16.u32 %rs15, %r35;
150
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
151
+ .loc 1 34 68
152
+ cvt.f32.bf16 %r38, %rs13;
153
+ mov.b32 %f17, %r38;
154
+ cvt.f32.bf16 %r39, %rs14;
155
+ mov.b32 %f18, %r39;
156
+ cvt.f32.bf16 %r40, %rs15;
157
+ mov.b32 %f19, %r40;
158
+ cvt.f32.bf16 %r41, %rs16;
159
+ mov.b32 %f20, %r41;
160
+ .loc 1 35 31
161
+ mul.wide.u32 %rd19, %r81, 4;
162
+ add.s64 %rd6, %rd14, %rd19;
163
+ .loc 1 35 36
164
+ mov.u32 %r42, 0x0;
165
+ mov.u32 %r43, 0x0;
166
+ mov.u32 %r44, 0x0;
167
+ mov.u32 %r45, 0x0;
168
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
169
+ @!%p1 mov.u32 %r42, %r6;
170
+ @!%p1 mov.u32 %r43, %r6;
171
+ @!%p1 mov.u32 %r44, %r6;
172
+ @!%p1 mov.u32 %r45, %r6;
173
+ .loc 1 37 18
174
+ add.f32 %f21, %f5, %f1;
175
+ add.f32 %f22, %f6, %f2;
176
+ add.f32 %f23, %f7, %f3;
177
+ .loc 1 39 18
178
+ add.f32 %f24, %f21, %f9;
179
+ add.f32 %f25, %f22, %f10;
180
+ add.f32 %f26, %f23, %f11;
181
+ .loc 1 41 18
182
+ add.f32 %f27, %f25, %f14;
183
+ add.f32 %f28, %f26, %f15;
184
+ .loc 1 43 19
185
+ add.f32 %f29, %f27, %f18;
186
+ add.f32 %f30, %f28, %f19;
187
+ .loc 1 41 18
188
+ add.f32 %f31, %f24, %f13;
189
+ add.f32 %f32, %f8, %f4;
190
+ .loc 1 43 19
191
+ add.f32 %f33, %f32, %f12;
192
+ add.f32 %f34, %f31, %f17;
193
+ $L__tmp1:
194
+ .loc 2 233 15
195
+ add.f32 %f35, %f34, %f29;
196
+ add.f32 %f36, %f33, %f16;
197
+ add.f32 %f37, %f35, %f30;
198
+ add.f32 %f38, %f36, %f20;
199
+ mov.b32 %r71, %f38;
200
+ add.f32 %f39, %f37, %f38;
201
+ $L__tmp2:
202
+ .loc 2 243 36
203
+ mov.b32 %r84, %f39;
204
+ shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1;
205
+ mov.b32 %f40, %r85;
206
+ $L__tmp3:
207
+ .loc 2 233 15
208
+ add.f32 %f41, %f39, %f40;
209
+ $L__tmp4:
210
+ .loc 2 243 36
211
+ mov.b32 %r86, %f41;
212
+ shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1;
213
+ mov.b32 %f42, %r87;
214
+ $L__tmp5:
215
+ .loc 2 233 15
216
+ add.f32 %f43, %f41, %f42;
217
+ $L__tmp6:
218
+ .loc 2 243 36
219
+ mov.b32 %r88, %f43;
220
+ shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1;
221
+ mov.b32 %f44, %r89;
222
+ $L__tmp7:
223
+ .loc 2 233 15
224
+ add.f32 %f45, %f43, %f44;
225
+ $L__tmp8:
226
+ .loc 2 243 36
227
+ mov.b32 %r90, %f45;
228
+ shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1;
229
+ mov.b32 %f46, %r91;
230
+ $L__tmp9:
231
+ .loc 2 233 15
232
+ add.f32 %f47, %f45, %f46;
233
+ $L__tmp10:
234
+ .loc 2 243 36
235
+ mov.b32 %r92, %f47;
236
+ shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1;
237
+ mov.b32 %f48, %r93;
238
+ $L__tmp11:
239
+ .loc 2 233 15
240
+ add.f32 %f49, %f47, %f48;
241
+ $L__tmp12:
242
+ .loc 2 243 36
243
+ setp.eq.s32 %p23, %r79, 0;
244
+ shr.u32 %r94, %r78, 3;
245
+ and.b32 %r95, %r94, 4;
246
+ mov.u32 %r96, global_smem;
247
+ add.s32 %r50, %r96, %r95;
248
+ mov.b32 %r51, %f49;
249
+ @%p23 st.shared.b32 [ %r50 + 0 ], %r51;
250
+ bar.sync 0;
251
+ setp.lt.s32 %p24, %r78, 2;
252
+ add.s32 %r53, %r96, %r80;
253
+ @%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
254
+ mov.b32 %f50, %r52;
255
+ shfl.sync.bfly.b32 %r97, %r52, 1, 31, -1;
256
+ mov.b32 %f51, %r97;
257
+ $L__tmp13:
258
+ .loc 2 233 15
259
+ add.f32 %f52, %f50, %f51;
260
+ $L__tmp14:
261
+ .loc 2 243 36
262
+ and.b32 %r98, %r78, 1;
263
+ setp.eq.b32 %p31, %r98, 1;
264
+ not.pred %p32, %p31;
265
+ and.pred %p25, %p24, %p32;
266
+ mov.b32 %r55, %f52;
267
+ @%p25 st.shared.b32 [ %r53 + 0 ], %r55;
268
+ bar.sync 0;
269
+ ld.shared.f32 %f53, [global_smem];
270
+ $L__tmp15:
271
+ .loc 3 8 15
272
+ add.f32 %f54, %f53, 0f00000000;
273
+ $L__tmp16:
274
+ .loc 1 51 20
275
+ mov.b32 %r57, %f54;
276
+ mov.b32 %r58, 1132462080;
277
+ div.full.f32 %r56, %r57, %r58;
278
+ mov.b32 %f55, %r56;
279
+ .loc 1 52 20
280
+ sub.f32 %f56, %f34, %f55;
281
+ sub.f32 %f57, %f29, %f55;
282
+ sub.f32 %f58, %f30, %f55;
283
+ sub.f32 %f59, %f38, %f55;
284
+ .loc 1 53 20
285
+ mul.f32 %f60, %f57, %f57;
286
+ $L__tmp17:
287
+ .loc 2 243 36
288
+ bar.sync 0;
289
+ $L__tmp18:
290
+ .loc 2 233 15
291
+ fma.rn.f32 %f61, %f56, %f56, %f60;
292
+ fma.rn.f32 %f62, %f58, %f58, %f61;
293
+ fma.rn.f32 %f63, %f59, %f59, %f62;
294
+ $L__tmp19:
295
+ .loc 2 243 36
296
+ mov.b32 %r99, %f63;
297
+ shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1;
298
+ mov.b32 %f64, %r100;
299
+ $L__tmp20:
300
+ .loc 2 233 15
301
+ add.f32 %f65, %f63, %f64;
302
+ $L__tmp21:
303
+ .loc 2 243 36
304
+ mov.b32 %r101, %f65;
305
+ shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1;
306
+ mov.b32 %f66, %r102;
307
+ $L__tmp22:
308
+ .loc 2 233 15
309
+ add.f32 %f67, %f65, %f66;
310
+ $L__tmp23:
311
+ .loc 2 243 36
312
+ mov.b32 %r103, %f67;
313
+ shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1;
314
+ mov.b32 %f68, %r104;
315
+ $L__tmp24:
316
+ .loc 2 233 15
317
+ add.f32 %f69, %f67, %f68;
318
+ $L__tmp25:
319
+ .loc 2 243 36
320
+ mov.b32 %r105, %f69;
321
+ shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1;
322
+ mov.b32 %f70, %r106;
323
+ $L__tmp26:
324
+ .loc 2 233 15
325
+ add.f32 %f71, %f69, %f70;
326
+ $L__tmp27:
327
+ .loc 2 243 36
328
+ mov.b32 %r107, %f71;
329
+ shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1;
330
+ mov.b32 %f72, %r108;
331
+ $L__tmp28:
332
+ .loc 2 233 15
333
+ add.f32 %f73, %f71, %f72;
334
+ $L__tmp29:
335
+ .loc 2 243 36
336
+ mov.b32 %r60, %f73;
337
+ @%p23 st.shared.b32 [ %r50 + 0 ], %r60;
338
+ bar.sync 0;
339
+ @%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
340
+ mov.b32 %f74, %r61;
341
+ shfl.sync.bfly.b32 %r109, %r61, 1, 31, -1;
342
+ mov.b32 %f75, %r109;
343
+ $L__tmp30:
344
+ .loc 2 233 15
345
+ add.f32 %f76, %f74, %f75;
346
+ $L__tmp31:
347
+ .loc 2 243 36
348
+ mov.b32 %r64, %f76;
349
+ @%p25 st.shared.b32 [ %r53 + 0 ], %r64;
350
+ bar.sync 0;
351
+ ld.shared.f32 %f77, [global_smem];
352
+ $L__tmp32:
353
+ .loc 3 8 15
354
+ add.f32 %f78, %f77, 0f00000000;
355
+ $L__tmp33:
356
+ .loc 1 59 20
357
+ mov.b32 %r66, %f78;
358
+ div.full.f32 %r65, %r66, %r58;
359
+ mov.b32 %f79, %r65;
360
+ .loc 1 61 20
361
+ add.f32 %f80, %f79, 0f3727C5AC;
362
+ .loc 1 62 26
363
+ rsqrt.approx.ftz.f32 %f81, %f80;
364
+ .loc 1 35 36
365
+ mov.b32 %f82, %r45;
366
+ mov.b32 %f83, %r44;
367
+ mov.b32 %f84, %r43;
368
+ mov.b32 %f85, %r42;
369
+ .loc 1 63 20
370
+ mul.f32 %f86, %f56, %f81;
371
+ mul.f32 %f87, %f57, %f81;
372
+ mul.f32 %f88, %f58, %f81;
373
+ mul.f32 %f89, %f59, %f81;
374
+ .loc 1 64 20
375
+ mul.f32 %f90, %f86, %f85;
376
+ mul.f32 %f91, %f87, %f84;
377
+ mul.f32 %f92, %f88, %f83;
378
+ mul.f32 %f93, %f89, %f82;
379
+ .loc 1 66 25
380
+ add.s64 %rd7, %rd15, %rd17;
381
+ .loc 1 66 48
382
+ mov.b32 %r68, %f34;
383
+ mov.b32 %r69, %f29;
384
+ mov.b32 %r70, %f30;
385
+ @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
386
+ .loc 1 67 25
387
+ add.s64 %rd8, %rd16, %rd18;
388
+ .loc 1 67 48
389
+ mov.b32 %r72, %f90;
390
+ cvt.rn.bf16.f32 %rs17, %r72;
391
+ mov.b32 %r73, %f91;
392
+ cvt.rn.bf16.f32 %rs18, %r73;
393
+ mov.b32 %r74, %f92;
394
+ cvt.rn.bf16.f32 %rs19, %r74;
395
+ mov.b32 %r75, %f93;
396
+ cvt.rn.bf16.f32 %rs20, %r75;
397
+ mov.b32 %r110, {%rs17, %rs18};
398
+ mov.b32 %r111, {%rs19, %rs20};
399
+ @%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 };
400
+ .loc 1 67 4
401
+ ret;
402
+ $L__tmp34:
403
+ $L__func_end0:
404
+
405
+ }
406
+ // .globl __nv_rsqrtf
407
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
408
+ .param .b32 __nv_rsqrtf_param_0
409
+ )
410
+ {
411
+ .reg .f32 %f<3>;
412
+ $L__func_begin1:
413
+
414
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
415
+ rsqrt.approx.ftz.f32 %f2, %f1;
416
+ st.param.f32 [func_retval0+0], %f2;
417
+ ret;
418
+ $L__func_end1:
419
+
420
+ }
421
+ .file 1 "/tmp/torchinductor_root/yb/cybxnh26qvsbmxmvdr54vaav2ezk2qxu7562fhhsn4lvyvqgoglw.py"
422
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
423
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
424
+ .section .debug_abbrev
425
+ {
426
+ .b8 1
427
+ .b8 17
428
+ .b8 1
429
+ .b8 37
430
+ .b8 8
431
+ .b8 19
432
+ .b8 5
433
+ .b8 3
434
+ .b8 8
435
+ .b8 16
436
+ .b8 6
437
+ .b8 27
438
+ .b8 8
439
+ .b8 180
440
+ .b8 66
441
+ .b8 12
442
+ .b8 17
443
+ .b8 1
444
+ .b8 18
445
+ .b8 1
446
+ .b8 0
447
+ .b8 0
448
+ .b8 2
449
+ .b8 46
450
+ .b8 0
451
+ .b8 135
452
+ .b8 64
453
+ .b8 8
454
+ .b8 3
455
+ .b8 8
456
+ .b8 58
457
+ .b8 11
458
+ .b8 59
459
+ .b8 11
460
+ .b8 63
461
+ .b8 12
462
+ .b8 32
463
+ .b8 11
464
+ .b8 0
465
+ .b8 0
466
+ .b8 3
467
+ .b8 46
468
+ .b8 1
469
+ .b8 17
470
+ .b8 1
471
+ .b8 18
472
+ .b8 1
473
+ .b8 64
474
+ .b8 10
475
+ .b8 49
476
+ .b8 19
477
+ .b8 0
478
+ .b8 0
479
+ .b8 4
480
+ .b8 29
481
+ .b8 1
482
+ .b8 49
483
+ .b8 19
484
+ .b8 17
485
+ .b8 1
486
+ .b8 18
487
+ .b8 1
488
+ .b8 88
489
+ .b8 11
490
+ .b8 89
491
+ .b8 11
492
+ .b8 87
493
+ .b8 11
494
+ .b8 0
495
+ .b8 0
496
+ .b8 5
497
+ .b8 29
498
+ .b8 0
499
+ .b8 49
500
+ .b8 19
501
+ .b8 17
502
+ .b8 1
503
+ .b8 18
504
+ .b8 1
505
+ .b8 88
506
+ .b8 11
507
+ .b8 89
508
+ .b8 11
509
+ .b8 87
510
+ .b8 11
511
+ .b8 0
512
+ .b8 0
513
+ .b8 0
514
+ }
515
+ .section .debug_info
516
+ {
517
+ .b32 407
518
+ .b8 2
519
+ .b8 0
520
+ .b32 .debug_abbrev
521
+ .b8 8
522
+ .b8 1
523
+ .b8 116
524
+ .b8 114
525
+ .b8 105
526
+ .b8 116
527
+ .b8 111
528
+ .b8 110
529
+ .b8 0
530
+ .b8 2
531
+ .b8 0
532
+ .b8 99
533
+ .b8 121
534
+ .b8 98
535
+ .b8 120
536
+ .b8 110
537
+ .b8 104
538
+ .b8 50
539
+ .b8 54
540
+ .b8 113
541
+ .b8 118
542
+ .b8 115
543
+ .b8 98
544
+ .b8 109
545
+ .b8 120
546
+ .b8 109
547
+ .b8 118
548
+ .b8 100
549
+ .b8 114
550
+ .b8 53
551
+ .b8 52
552
+ .b8 118
553
+ .b8 97
554
+ .b8 97
555
+ .b8 118
556
+ .b8 50
557
+ .b8 101
558
+ .b8 122
559
+ .b8 107
560
+ .b8 50
561
+ .b8 113
562
+ .b8 120
563
+ .b8 117
564
+ .b8 55
565
+ .b8 53
566
+ .b8 54
567
+ .b8 50
568
+ .b8 102
569
+ .b8 104
570
+ .b8 104
571
+ .b8 115
572
+ .b8 110
573
+ .b8 52
574
+ .b8 108
575
+ .b8 118
576
+ .b8 121
577
+ .b8 118
578
+ .b8 113
579
+ .b8 103
580
+ .b8 111
581
+ .b8 103
582
+ .b8 108
583
+ .b8 119
584
+ .b8 46
585
+ .b8 112
586
+ .b8 121
587
+ .b8 0
588
+ .b32 .debug_line
589
+ .b8 47
590
+ .b8 116
591
+ .b8 109
592
+ .b8 112
593
+ .b8 47
594
+ .b8 116
595
+ .b8 111
596
+ .b8 114
597
+ .b8 99
598
+ .b8 104
599
+ .b8 105
600
+ .b8 110
601
+ .b8 100
602
+ .b8 117
603
+ .b8 99
604
+ .b8 116
605
+ .b8 111
606
+ .b8 114
607
+ .b8 95
608
+ .b8 114
609
+ .b8 111
610
+ .b8 111
611
+ .b8 116
612
+ .b8 47
613
+ .b8 121
614
+ .b8 98
615
+ .b8 0
616
+ .b8 1
617
+ .b64 $L__func_begin0
618
+ .b64 $L__func_end0
619
+ .b8 2
620
+ .b8 116
621
+ .b8 114
622
+ .b8 105
623
+ .b8 116
624
+ .b8 111
625
+ .b8 110
626
+ .b8 95
627
+ .b8 95
628
+ .b8 48
629
+ .b8 100
630
+ .b8 49
631
+ .b8 100
632
+ .b8 50
633
+ .b8 100
634
+ .b8 51
635
+ .b8 100
636
+ .b8 52
637
+ .b8 100
638
+ .b8 53
639
+ .b8 100
640
+ .b8 54
641
+ .b8 100
642
+ .b8 55
643
+ .b8 100
644
+ .b8 56
645
+ .b8 100
646
+ .b8 101
647
+ .b8 57
648
+ .b8 100
649
+ .b8 101
650
+ .b8 0
651
+ .b8 116
652
+ .b8 114
653
+ .b8 105
654
+ .b8 116
655
+ .b8 111
656
+ .b8 110
657
+ .b8 95
658
+ .b8 95
659
+ .b8 48
660
+ .b8 100
661
+ .b8 49
662
+ .b8 100
663
+ .b8 50
664
+ .b8 100
665
+ .b8 51
666
+ .b8 100
667
+ .b8 52
668
+ .b8 100
669
+ .b8 53
670
+ .b8 100
671
+ .b8 54
672
+ .b8 100
673
+ .b8 55
674
+ .b8 100
675
+ .b8 56
676
+ .b8 100
677
+ .b8 101
678
+ .b8 57
679
+ .b8 100
680
+ .b8 101
681
+ .b8 0
682
+ .b8 1
683
+ .b8 18
684
+ .b8 1
685
+ .b8 1
686
+ .b8 3
687
+ .b64 $L__func_begin0
688
+ .b64 $L__func_end0
689
+ .b8 1
690
+ .b8 156
691
+ .b32 125
692
+ .b8 4
693
+ .b32 125
694
+ .b64 $L__tmp1
695
+ .b64 $L__tmp14
696
+ .b8 2
697
+ .b8 48
698
+ .b8 59
699
+ .b8 5
700
+ .b32 125
701
+ .b64 $L__tmp1
702
+ .b64 $L__tmp14
703
+ .b8 2
704
+ .b8 243
705
+ .b8 36
706
+ .b8 0
707
+ .b8 5
708
+ .b32 125
709
+ .b64 $L__tmp2
710
+ .b64 $L__tmp15
711
+ .b8 2
712
+ .b8 48
713
+ .b8 59
714
+ .b8 5
715
+ .b32 125
716
+ .b64 $L__tmp15
717
+ .b64 $L__tmp16
718
+ .b8 3
719
+ .b8 48
720
+ .b8 45
721
+ .b8 5
722
+ .b32 125
723
+ .b64 $L__tmp17
724
+ .b64 $L__tmp32
725
+ .b8 2
726
+ .b8 56
727
+ .b8 59
728
+ .b8 4
729
+ .b32 125
730
+ .b64 $L__tmp18
731
+ .b64 $L__tmp31
732
+ .b8 2
733
+ .b8 56
734
+ .b8 59
735
+ .b8 5
736
+ .b32 125
737
+ .b64 $L__tmp18
738
+ .b64 $L__tmp31
739
+ .b8 2
740
+ .b8 243
741
+ .b8 36
742
+ .b8 0
743
+ .b8 5
744
+ .b32 125
745
+ .b64 $L__tmp32
746
+ .b64 $L__tmp33
747
+ .b8 3
748
+ .b8 56
749
+ .b8 45
750
+ .b8 0
751
+ .b8 0
752
+ }
753
+ .section .debug_pubnames
754
+ {
755
+ .b32 $L__pubNames_end0-$L__pubNames_start0
756
+ $L__pubNames_start0:
757
+ .b8 2
758
+ .b8 0
759
+ .b32 .debug_info
760
+ .b32 411
761
+ .b32 125
762
+ .b8 116
763
+ .b8 114
764
+ .b8 105
765
+ .b8 116
766
+ .b8 111
767
+ .b8 110
768
+ .b8 95
769
+ .b8 95
770
+ .b8 48
771
+ .b8 100
772
+ .b8 49
773
+ .b8 100
774
+ .b8 50
775
+ .b8 100
776
+ .b8 51
777
+ .b8 100
778
+ .b8 52
779
+ .b8 100
780
+ .b8 53
781
+ .b8 100
782
+ .b8 54
783
+ .b8 100
784
+ .b8 55
785
+ .b8 100
786
+ .b8 56
787
+ .b8 100
788
+ .b8 101
789
+ .b8 57
790
+ .b8 100
791
+ .b8 101
792
+ .b8 0
793
+ .b32 0
794
+ $L__pubNames_end0:
795
+ }
796
+ .section .debug_pubtypes
797
+ {
798
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
799
+ $L__pubTypes_start0:
800
+ .b8 2
801
+ .b8 0
802
+ .b32 .debug_info
803
+ .b32 411
804
+ .b32 0
805
+ $L__pubTypes_end0:
806
+ }
807
+ .section .debug_loc { }
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
27
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
28
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
29
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
31
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
32
+ %21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
33
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
34
+ %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
35
+ %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
36
+ %25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
37
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
38
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
39
+ %28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
40
+ %29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
41
+ %30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
42
+ %31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
43
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
44
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
45
+ ^bb0(%arg10: f32, %arg11: f32):
46
+ %53 = arith.addf %arg10, %arg11 : f32
47
+ tt.reduce.return %53 : f32
48
+ }) : (tensor<256xf32, #blocked>) -> f32
49
+ %34 = arith.addf %33, %cst_2 : f32
50
+ %35 = arith.divf %34, %cst_1 : f32
51
+ %36 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
52
+ %37 = arith.subf %31, %36 : tensor<256xf32, #blocked>
53
+ %38 = arith.mulf %37, %37 : tensor<256xf32, #blocked>
54
+ %39 = arith.select %2, %38, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
55
+ %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
56
+ ^bb0(%arg10: f32, %arg11: f32):
57
+ %53 = arith.addf %arg10, %arg11 : f32
58
+ tt.reduce.return %53 : f32
59
+ }) : (tensor<256xf32, #blocked>) -> f32
60
+ %41 = arith.addf %40, %cst_2 : f32
61
+ %42 = arith.divf %41, %cst_1 : f32
62
+ %43 = arith.addf %42, %cst_0 : f32
63
+ %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
64
+ %45 = tt.splat %44 : (f32) -> tensor<256xf32, #blocked>
65
+ %46 = arith.mulf %37, %45 : tensor<256xf32, #blocked>
66
+ %47 = arith.mulf %46, %27 : tensor<256xf32, #blocked>
67
+ %48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
68
+ %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
69
+ tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
70
+ %50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
71
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
72
+ %52 = arith.truncf %47 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
73
+ tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
74
+ tt.return
75
+ }
76
+ }
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
24
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
26
+ %16 = arith.addf %8, %12 : tensor<256xf32>
27
+ %17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32>
28
+ %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
29
+ ^bb0(%arg6: f32, %arg7: f32):
30
+ %36 = arith.addf %arg6, %arg7 : f32
31
+ tt.reduce.return %36 : f32
32
+ }) : (tensor<256xf32>) -> f32
33
+ %19 = arith.addf %18, %cst_0 : f32
34
+ %20 = arith.divf %19, %cst_1 : f32
35
+ %21 = tt.splat %20 : (f32) -> tensor<256xf32>
36
+ %22 = arith.subf %16, %21 : tensor<256xf32>
37
+ %23 = arith.mulf %22, %22 : tensor<256xf32>
38
+ %24 = arith.select %2, %23, %cst_3 : tensor<256xi1>, tensor<256xf32>
39
+ %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({
40
+ ^bb0(%arg6: f32, %arg7: f32):
41
+ %36 = arith.addf %arg6, %arg7 : f32
42
+ tt.reduce.return %36 : f32
43
+ }) : (tensor<256xf32>) -> f32
44
+ %26 = arith.addf %25, %cst_0 : f32
45
+ %27 = arith.divf %26, %cst_1 : f32
46
+ %28 = arith.addf %27, %cst_2 : f32
47
+ %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
48
+ %30 = tt.splat %29 : (f32) -> tensor<256xf32>
49
+ %31 = arith.mulf %22, %30 : tensor<256xf32>
50
+ %32 = arith.mulf %31, %15 : tensor<256xf32>
51
+ %33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
52
+ %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
53
+ %35 = arith.truncf %32 : tensor<256xf32> to tensor<256xbf16>
54
+ tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
55
+ tt.return
56
+ }
57
+ }
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin ADDED
Binary file (23.9 kB). View file
 
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1de(
13
+ .param .u64 triton__0d1de_param_0,
14
+ .param .u32 triton__0d1de_param_1
15
+ )
16
+ .maxntid 128, 1, 1
17
+ {
18
+ .reg .pred %p<27>;
19
+ .reg .b16 %rs<17>;
20
+ .reg .b32 %r<67>;
21
+ .reg .f32 %f<431>;
22
+ .reg .b64 %rd<6>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r14, %tid.x;
31
+ shl.b32 %r15, %r14, 3;
32
+ and.b32 %r16, %r15, 1016;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r17, %r1, 10;
37
+ .loc 1 21 23
38
+ or.b32 %r18, %r17, %r16;
39
+ .loc 1 24 34
40
+ mul.wide.s32 %rd4, %r18, 2;
41
+ add.s64 %rd5, %rd3, %rd4;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 39
44
+ mov.u32 %r2, 0x0;
45
+ mov.u32 %r3, 0x0;
46
+ mov.u32 %r4, 0x0;
47
+ mov.u32 %r5, 0x0;
48
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd5 + 0 ];
49
+ cvt.u16.u32 %rs1, %r2;
50
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
51
+ cvt.u16.u32 %rs3, %r3;
52
+ .loc 1 24 48
53
+ cvt.f32.bf16 %r6, %rs1;
54
+ mov.b32 %f1, %r6;
55
+ cvt.f32.bf16 %r7, %rs2;
56
+ mov.b32 %f2, %r7;
57
+ .loc 1 29 18
58
+ mul.f32 %f9, %f1, 0f3F3504F3;
59
+ .loc 1 30 23
60
+ abs.ftz.f32 %f17, %f9;
61
+ setp.ge.f32 %p2, %f17, 0f3F8060FE;
62
+ mov.f32 %f365, 0f3789CA3C;
63
+ mov.f32 %f364, 0fB9F560B9;
64
+ mov.f32 %f363, 0f3BAC840B;
65
+ mov.f32 %f362, 0fBD0C8162;
66
+ mov.f32 %f361, 0f3E1CF906;
67
+ mov.f32 %f360, 0f3F6A937E;
68
+ mov.f32 %f359, 0f3F20D842;
69
+ mov.f32 %f366, %f17;
70
+ @%p2 bra $L__BB0_2;
71
+ .loc 1 0 23
72
+ mov.f32 %f365, 0f38B1E96A;
73
+ mov.f32 %f364, 0fBA574D20;
74
+ mov.f32 %f363, 0f3BAAD5EA;
75
+ mov.f32 %f362, 0fBCDC1BE7;
76
+ mov.f32 %f361, 0f3DE718AF;
77
+ mov.f32 %f360, 0fBEC093AC;
78
+ mov.f32 %f359, 0f3E0375D3;
79
+ .loc 1 30 23
80
+ mul.f32 %f366, %f9, %f9;
81
+ $L__BB0_2:
82
+ .loc 1 0 0
83
+ cvt.f32.bf16 %r8, %rs3;
84
+ mul.f32 %f10, %f2, 0f3F3504F3;
85
+ .loc 1 30 23
86
+ setp.ltu.f32 %p3, %f17, 0f3F8060FE;
87
+ fma.rn.ftz.f32 %f135, %f365, %f366, %f364;
88
+ fma.rn.ftz.f32 %f136, %f135, %f366, %f363;
89
+ fma.rn.ftz.f32 %f137, %f136, %f366, %f362;
90
+ fma.rn.ftz.f32 %f138, %f137, %f366, %f361;
91
+ fma.rn.ftz.f32 %f139, %f138, %f366, %f360;
92
+ fma.rn.ftz.f32 %f140, %f139, %f366, %f359;
93
+ neg.f32 %f141, %f366;
94
+ selp.f32 %f142, %f141, %f9, %p2;
95
+ fma.rn.ftz.f32 %f367, %f140, %f142, %f142;
96
+ mov.f32 %f358, 0f3F800000;
97
+ @%p3 bra $L__BB0_4;
98
+ ex2.approx.ftz.f32 %f143, %f367;
99
+ sub.f32 %f145, %f358, %f143;
100
+ mov.b32 %r19, %f145;
101
+ mov.b32 %r20, %f9;
102
+ and.b32 %r21, %r20, -2147483648;
103
+ or.b32 %r22, %r21, %r19;
104
+ mov.b32 %f367, %r22;
105
+ $L__BB0_4:
106
+ .loc 1 0 0
107
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
108
+ mov.b32 %f3, %r8;
109
+ .loc 1 30 23
110
+ abs.ftz.f32 %f30, %f10;
111
+ setp.ge.f32 %p5, %f30, 0f3F8060FE;
112
+ mov.f32 %f374, 0f3789CA3C;
113
+ mov.f32 %f373, 0fB9F560B9;
114
+ mov.f32 %f372, 0f3BAC840B;
115
+ mov.f32 %f371, 0fBD0C8162;
116
+ mov.f32 %f370, 0f3E1CF906;
117
+ mov.f32 %f369, 0f3F6A937E;
118
+ mov.f32 %f368, 0f3F20D842;
119
+ mov.f32 %f375, %f30;
120
+ @%p5 bra $L__BB0_6;
121
+ mul.f32 %f375, %f10, %f10;
122
+ mov.f32 %f374, 0f38B1E96A;
123
+ mov.f32 %f373, 0fBA574D20;
124
+ mov.f32 %f372, 0f3BAAD5EA;
125
+ mov.f32 %f371, 0fBCDC1BE7;
126
+ mov.f32 %f370, 0f3DE718AF;
127
+ mov.f32 %f369, 0fBEC093AC;
128
+ mov.f32 %f368, 0f3E0375D3;
129
+ $L__BB0_6:
130
+ .loc 1 0 0
131
+ cvt.f32.bf16 %r9, %rs4;
132
+ mul.f32 %f11, %f3, 0f3F3504F3;
133
+ .loc 1 30 23
134
+ setp.ltu.f32 %p6, %f30, 0f3F8060FE;
135
+ fma.rn.ftz.f32 %f160, %f374, %f375, %f373;
136
+ fma.rn.ftz.f32 %f161, %f160, %f375, %f372;
137
+ fma.rn.ftz.f32 %f162, %f161, %f375, %f371;
138
+ fma.rn.ftz.f32 %f163, %f162, %f375, %f370;
139
+ fma.rn.ftz.f32 %f164, %f163, %f375, %f369;
140
+ fma.rn.ftz.f32 %f165, %f164, %f375, %f368;
141
+ neg.f32 %f166, %f375;
142
+ selp.f32 %f167, %f166, %f10, %p5;
143
+ fma.rn.ftz.f32 %f376, %f165, %f167, %f167;
144
+ @%p6 bra $L__BB0_8;
145
+ ex2.approx.ftz.f32 %f168, %f376;
146
+ sub.f32 %f170, %f358, %f168;
147
+ mov.b32 %r23, %f170;
148
+ mov.b32 %r24, %f10;
149
+ and.b32 %r25, %r24, -2147483648;
150
+ or.b32 %r26, %r25, %r23;
151
+ mov.b32 %f376, %r26;
152
+ $L__BB0_8:
153
+ .loc 1 0 0
154
+ cvt.u16.u32 %rs5, %r4;
155
+ mov.b32 %f4, %r9;
156
+ .loc 1 30 23
157
+ abs.ftz.f32 %f43, %f11;
158
+ setp.ge.f32 %p8, %f43, 0f3F8060FE;
159
+ mov.f32 %f383, 0f3789CA3C;
160
+ mov.f32 %f382, 0fB9F560B9;
161
+ mov.f32 %f381, 0f3BAC840B;
162
+ mov.f32 %f380, 0fBD0C8162;
163
+ mov.f32 %f379, 0f3E1CF906;
164
+ mov.f32 %f378, 0f3F6A937E;
165
+ mov.f32 %f377, 0f3F20D842;
166
+ mov.f32 %f384, %f43;
167
+ @%p8 bra $L__BB0_10;
168
+ mul.f32 %f384, %f11, %f11;
169
+ mov.f32 %f383, 0f38B1E96A;
170
+ mov.f32 %f382, 0fBA574D20;
171
+ mov.f32 %f381, 0f3BAAD5EA;
172
+ mov.f32 %f380, 0fBCDC1BE7;
173
+ mov.f32 %f379, 0f3DE718AF;
174
+ mov.f32 %f378, 0fBEC093AC;
175
+ mov.f32 %f377, 0f3E0375D3;
176
+ $L__BB0_10:
177
+ .loc 1 0 0
178
+ cvt.f32.bf16 %r10, %rs5;
179
+ mul.f32 %f12, %f4, 0f3F3504F3;
180
+ .loc 1 30 23
181
+ setp.ltu.f32 %p9, %f43, 0f3F8060FE;
182
+ fma.rn.ftz.f32 %f185, %f383, %f384, %f382;
183
+ fma.rn.ftz.f32 %f186, %f185, %f384, %f381;
184
+ fma.rn.ftz.f32 %f187, %f186, %f384, %f380;
185
+ fma.rn.ftz.f32 %f188, %f187, %f384, %f379;
186
+ fma.rn.ftz.f32 %f189, %f188, %f384, %f378;
187
+ fma.rn.ftz.f32 %f190, %f189, %f384, %f377;
188
+ neg.f32 %f191, %f384;
189
+ selp.f32 %f192, %f191, %f11, %p8;
190
+ fma.rn.ftz.f32 %f385, %f190, %f192, %f192;
191
+ @%p9 bra $L__BB0_12;
192
+ ex2.approx.ftz.f32 %f193, %f385;
193
+ sub.f32 %f195, %f358, %f193;
194
+ mov.b32 %r27, %f195;
195
+ mov.b32 %r28, %f11;
196
+ and.b32 %r29, %r28, -2147483648;
197
+ or.b32 %r30, %r29, %r27;
198
+ mov.b32 %f385, %r30;
199
+ $L__BB0_12:
200
+ .loc 1 0 0
201
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
202
+ mov.b32 %f5, %r10;
203
+ .loc 1 30 23
204
+ abs.ftz.f32 %f56, %f12;
205
+ setp.ge.f32 %p11, %f56, 0f3F8060FE;
206
+ mov.f32 %f392, 0f3789CA3C;
207
+ mov.f32 %f391, 0fB9F560B9;
208
+ mov.f32 %f390, 0f3BAC840B;
209
+ mov.f32 %f389, 0fBD0C8162;
210
+ mov.f32 %f388, 0f3E1CF906;
211
+ mov.f32 %f387, 0f3F6A937E;
212
+ mov.f32 %f386, 0f3F20D842;
213
+ mov.f32 %f393, %f56;
214
+ @%p11 bra $L__BB0_14;
215
+ mul.f32 %f393, %f12, %f12;
216
+ mov.f32 %f392, 0f38B1E96A;
217
+ mov.f32 %f391, 0fBA574D20;
218
+ mov.f32 %f390, 0f3BAAD5EA;
219
+ mov.f32 %f389, 0fBCDC1BE7;
220
+ mov.f32 %f388, 0f3DE718AF;
221
+ mov.f32 %f387, 0fBEC093AC;
222
+ mov.f32 %f386, 0f3E0375D3;
223
+ $L__BB0_14:
224
+ .loc 1 0 0
225
+ cvt.f32.bf16 %r11, %rs6;
226
+ mul.f32 %f13, %f5, 0f3F3504F3;
227
+ .loc 1 30 23
228
+ setp.ltu.f32 %p12, %f56, 0f3F8060FE;
229
+ fma.rn.ftz.f32 %f210, %f392, %f393, %f391;
230
+ fma.rn.ftz.f32 %f211, %f210, %f393, %f390;
231
+ fma.rn.ftz.f32 %f212, %f211, %f393, %f389;
232
+ fma.rn.ftz.f32 %f213, %f212, %f393, %f388;
233
+ fma.rn.ftz.f32 %f214, %f213, %f393, %f387;
234
+ fma.rn.ftz.f32 %f215, %f214, %f393, %f386;
235
+ neg.f32 %f216, %f393;
236
+ selp.f32 %f217, %f216, %f12, %p11;
237
+ fma.rn.ftz.f32 %f394, %f215, %f217, %f217;
238
+ @%p12 bra $L__BB0_16;
239
+ ex2.approx.ftz.f32 %f218, %f394;
240
+ sub.f32 %f220, %f358, %f218;
241
+ mov.b32 %r31, %f220;
242
+ mov.b32 %r32, %f12;
243
+ and.b32 %r33, %r32, -2147483648;
244
+ or.b32 %r34, %r33, %r31;
245
+ mov.b32 %f394, %r34;
246
+ $L__BB0_16:
247
+ .loc 1 0 0
248
+ cvt.u16.u32 %rs7, %r5;
249
+ mov.b32 %f6, %r11;
250
+ .loc 1 30 23
251
+ abs.ftz.f32 %f69, %f13;
252
+ setp.ge.f32 %p14, %f69, 0f3F8060FE;
253
+ mov.f32 %f401, 0f3789CA3C;
254
+ mov.f32 %f400, 0fB9F560B9;
255
+ mov.f32 %f399, 0f3BAC840B;
256
+ mov.f32 %f398, 0fBD0C8162;
257
+ mov.f32 %f397, 0f3E1CF906;
258
+ mov.f32 %f396, 0f3F6A937E;
259
+ mov.f32 %f395, 0f3F20D842;
260
+ mov.f32 %f402, %f69;
261
+ @%p14 bra $L__BB0_18;
262
+ mul.f32 %f402, %f13, %f13;
263
+ mov.f32 %f401, 0f38B1E96A;
264
+ mov.f32 %f400, 0fBA574D20;
265
+ mov.f32 %f399, 0f3BAAD5EA;
266
+ mov.f32 %f398, 0fBCDC1BE7;
267
+ mov.f32 %f397, 0f3DE718AF;
268
+ mov.f32 %f396, 0fBEC093AC;
269
+ mov.f32 %f395, 0f3E0375D3;
270
+ $L__BB0_18:
271
+ .loc 1 0 0
272
+ cvt.f32.bf16 %r12, %rs7;
273
+ mul.f32 %f14, %f6, 0f3F3504F3;
274
+ .loc 1 30 23
275
+ setp.ltu.f32 %p15, %f69, 0f3F8060FE;
276
+ fma.rn.ftz.f32 %f235, %f401, %f402, %f400;
277
+ fma.rn.ftz.f32 %f236, %f235, %f402, %f399;
278
+ fma.rn.ftz.f32 %f237, %f236, %f402, %f398;
279
+ fma.rn.ftz.f32 %f238, %f237, %f402, %f397;
280
+ fma.rn.ftz.f32 %f239, %f238, %f402, %f396;
281
+ fma.rn.ftz.f32 %f240, %f239, %f402, %f395;
282
+ neg.f32 %f241, %f402;
283
+ selp.f32 %f242, %f241, %f13, %p14;
284
+ fma.rn.ftz.f32 %f403, %f240, %f242, %f242;
285
+ @%p15 bra $L__BB0_20;
286
+ ex2.approx.ftz.f32 %f243, %f403;
287
+ sub.f32 %f245, %f358, %f243;
288
+ mov.b32 %r35, %f245;
289
+ mov.b32 %r36, %f13;
290
+ and.b32 %r37, %r36, -2147483648;
291
+ or.b32 %r38, %r37, %r35;
292
+ mov.b32 %f403, %r38;
293
+ $L__BB0_20:
294
+ .loc 1 0 0
295
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
296
+ mov.b32 %f7, %r12;
297
+ .loc 1 30 23
298
+ abs.ftz.f32 %f82, %f14;
299
+ setp.ge.f32 %p17, %f82, 0f3F8060FE;
300
+ mov.f32 %f410, 0f3789CA3C;
301
+ mov.f32 %f409, 0fB9F560B9;
302
+ mov.f32 %f408, 0f3BAC840B;
303
+ mov.f32 %f407, 0fBD0C8162;
304
+ mov.f32 %f406, 0f3E1CF906;
305
+ mov.f32 %f405, 0f3F6A937E;
306
+ mov.f32 %f404, 0f3F20D842;
307
+ mov.f32 %f411, %f82;
308
+ @%p17 bra $L__BB0_22;
309
+ mul.f32 %f411, %f14, %f14;
310
+ mov.f32 %f410, 0f38B1E96A;
311
+ mov.f32 %f409, 0fBA574D20;
312
+ mov.f32 %f408, 0f3BAAD5EA;
313
+ mov.f32 %f407, 0fBCDC1BE7;
314
+ mov.f32 %f406, 0f3DE718AF;
315
+ mov.f32 %f405, 0fBEC093AC;
316
+ mov.f32 %f404, 0f3E0375D3;
317
+ $L__BB0_22:
318
+ .loc 1 0 0
319
+ cvt.f32.bf16 %r13, %rs8;
320
+ mul.f32 %f15, %f7, 0f3F3504F3;
321
+ .loc 1 30 23
322
+ setp.ltu.f32 %p18, %f82, 0f3F8060FE;
323
+ fma.rn.ftz.f32 %f260, %f410, %f411, %f409;
324
+ fma.rn.ftz.f32 %f261, %f260, %f411, %f408;
325
+ fma.rn.ftz.f32 %f262, %f261, %f411, %f407;
326
+ fma.rn.ftz.f32 %f263, %f262, %f411, %f406;
327
+ fma.rn.ftz.f32 %f264, %f263, %f411, %f405;
328
+ fma.rn.ftz.f32 %f265, %f264, %f411, %f404;
329
+ neg.f32 %f266, %f411;
330
+ selp.f32 %f267, %f266, %f14, %p17;
331
+ fma.rn.ftz.f32 %f412, %f265, %f267, %f267;
332
+ @%p18 bra $L__BB0_24;
333
+ ex2.approx.ftz.f32 %f268, %f412;
334
+ sub.f32 %f270, %f358, %f268;
335
+ mov.b32 %r39, %f270;
336
+ mov.b32 %r40, %f14;
337
+ and.b32 %r41, %r40, -2147483648;
338
+ or.b32 %r42, %r41, %r39;
339
+ mov.b32 %f412, %r42;
340
+ $L__BB0_24:
341
+ .loc 1 0 0
342
+ mov.b32 %f8, %r13;
343
+ .loc 1 30 23
344
+ abs.ftz.f32 %f95, %f15;
345
+ setp.ge.f32 %p20, %f95, 0f3F8060FE;
346
+ mov.f32 %f419, 0f3789CA3C;
347
+ mov.f32 %f418, 0fB9F560B9;
348
+ mov.f32 %f417, 0f3BAC840B;
349
+ mov.f32 %f416, 0fBD0C8162;
350
+ mov.f32 %f415, 0f3E1CF906;
351
+ mov.f32 %f414, 0f3F6A937E;
352
+ mov.f32 %f413, 0f3F20D842;
353
+ mov.f32 %f420, %f95;
354
+ @%p20 bra $L__BB0_26;
355
+ mul.f32 %f420, %f15, %f15;
356
+ mov.f32 %f419, 0f38B1E96A;
357
+ mov.f32 %f418, 0fBA574D20;
358
+ mov.f32 %f417, 0f3BAAD5EA;
359
+ mov.f32 %f416, 0fBCDC1BE7;
360
+ mov.f32 %f415, 0f3DE718AF;
361
+ mov.f32 %f414, 0fBEC093AC;
362
+ mov.f32 %f413, 0f3E0375D3;
363
+ $L__BB0_26:
364
+ .loc 1 0 0
365
+ mul.f32 %f16, %f8, 0f3F3504F3;
366
+ .loc 1 30 23
367
+ setp.ltu.f32 %p21, %f95, 0f3F8060FE;
368
+ fma.rn.ftz.f32 %f285, %f419, %f420, %f418;
369
+ fma.rn.ftz.f32 %f286, %f285, %f420, %f417;
370
+ fma.rn.ftz.f32 %f287, %f286, %f420, %f416;
371
+ fma.rn.ftz.f32 %f288, %f287, %f420, %f415;
372
+ fma.rn.ftz.f32 %f289, %f288, %f420, %f414;
373
+ fma.rn.ftz.f32 %f290, %f289, %f420, %f413;
374
+ neg.f32 %f291, %f420;
375
+ selp.f32 %f292, %f291, %f15, %p20;
376
+ fma.rn.ftz.f32 %f421, %f290, %f292, %f292;
377
+ @%p21 bra $L__BB0_28;
378
+ ex2.approx.ftz.f32 %f293, %f421;
379
+ sub.f32 %f295, %f358, %f293;
380
+ mov.b32 %r43, %f295;
381
+ mov.b32 %r44, %f15;
382
+ and.b32 %r45, %r44, -2147483648;
383
+ or.b32 %r46, %r45, %r43;
384
+ mov.b32 %f421, %r46;
385
+ $L__BB0_28:
386
+ abs.ftz.f32 %f108, %f16;
387
+ setp.ge.f32 %p23, %f108, 0f3F8060FE;
388
+ mov.f32 %f428, 0f3789CA3C;
389
+ mov.f32 %f427, 0fB9F560B9;
390
+ mov.f32 %f426, 0f3BAC840B;
391
+ mov.f32 %f425, 0fBD0C8162;
392
+ mov.f32 %f424, 0f3E1CF906;
393
+ mov.f32 %f423, 0f3F6A937E;
394
+ mov.f32 %f422, 0f3F20D842;
395
+ mov.f32 %f429, %f108;
396
+ @%p23 bra $L__BB0_30;
397
+ mul.f32 %f429, %f16, %f16;
398
+ mov.f32 %f428, 0f38B1E96A;
399
+ mov.f32 %f427, 0fBA574D20;
400
+ mov.f32 %f426, 0f3BAAD5EA;
401
+ mov.f32 %f425, 0fBCDC1BE7;
402
+ mov.f32 %f424, 0f3DE718AF;
403
+ mov.f32 %f423, 0fBEC093AC;
404
+ mov.f32 %f422, 0f3E0375D3;
405
+ $L__BB0_30:
406
+ setp.ltu.f32 %p24, %f108, 0f3F8060FE;
407
+ fma.rn.ftz.f32 %f310, %f428, %f429, %f427;
408
+ fma.rn.ftz.f32 %f311, %f310, %f429, %f426;
409
+ fma.rn.ftz.f32 %f312, %f311, %f429, %f425;
410
+ fma.rn.ftz.f32 %f313, %f312, %f429, %f424;
411
+ fma.rn.ftz.f32 %f314, %f313, %f429, %f423;
412
+ fma.rn.ftz.f32 %f315, %f314, %f429, %f422;
413
+ neg.f32 %f316, %f429;
414
+ selp.f32 %f317, %f316, %f16, %p23;
415
+ fma.rn.ftz.f32 %f430, %f315, %f317, %f317;
416
+ @%p24 bra $L__BB0_32;
417
+ ex2.approx.ftz.f32 %f318, %f430;
418
+ sub.f32 %f320, %f358, %f318;
419
+ mov.b32 %r47, %f320;
420
+ mov.b32 %r48, %f16;
421
+ and.b32 %r49, %r48, -2147483648;
422
+ or.b32 %r50, %r49, %r47;
423
+ mov.b32 %f430, %r50;
424
+ $L__BB0_32:
425
+ .loc 1 27 18
426
+ mul.f32 %f321, %f8, 0f3F000000;
427
+ mul.f32 %f322, %f7, 0f3F000000;
428
+ mul.f32 %f323, %f6, 0f3F000000;
429
+ mul.f32 %f324, %f5, 0f3F000000;
430
+ mul.f32 %f325, %f4, 0f3F000000;
431
+ mul.f32 %f326, %f3, 0f3F000000;
432
+ mul.f32 %f327, %f2, 0f3F000000;
433
+ mul.f32 %f328, %f1, 0f3F000000;
434
+ .loc 1 32 18
435
+ add.f32 %f329, %f367, 0f3F800000;
436
+ add.f32 %f330, %f376, 0f3F800000;
437
+ add.f32 %f331, %f385, 0f3F800000;
438
+ add.f32 %f332, %f394, 0f3F800000;
439
+ add.f32 %f333, %f403, 0f3F800000;
440
+ add.f32 %f334, %f412, 0f3F800000;
441
+ add.f32 %f335, %f421, 0f3F800000;
442
+ add.f32 %f336, %f430, 0f3F800000;
443
+ .loc 1 33 18
444
+ mul.f32 %f337, %f328, %f329;
445
+ mul.f32 %f338, %f327, %f330;
446
+ mul.f32 %f339, %f326, %f331;
447
+ mul.f32 %f340, %f325, %f332;
448
+ mul.f32 %f341, %f324, %f333;
449
+ mul.f32 %f342, %f323, %f334;
450
+ mul.f32 %f343, %f322, %f335;
451
+ mul.f32 %f344, %f321, %f336;
452
+ .loc 1 35 40
453
+ mov.b32 %r51, %f337;
454
+ cvt.rn.bf16.f32 %rs9, %r51;
455
+ mov.b32 %r52, %f338;
456
+ cvt.rn.bf16.f32 %rs10, %r52;
457
+ mov.b32 %r53, %f339;
458
+ cvt.rn.bf16.f32 %rs11, %r53;
459
+ mov.b32 %r54, %f340;
460
+ cvt.rn.bf16.f32 %rs12, %r54;
461
+ mov.b32 %r55, %f341;
462
+ cvt.rn.bf16.f32 %rs13, %r55;
463
+ mov.b32 %r56, %f342;
464
+ cvt.rn.bf16.f32 %rs14, %r56;
465
+ mov.b32 %r57, %f343;
466
+ cvt.rn.bf16.f32 %rs15, %r57;
467
+ mov.b32 %r58, %f344;
468
+ cvt.rn.bf16.f32 %rs16, %r58;
469
+ mov.b32 %r63, {%rs9, %rs10};
470
+ mov.b32 %r64, {%rs11, %rs12};
471
+ mov.b32 %r65, {%rs13, %rs14};
472
+ mov.b32 %r66, {%rs15, %rs16};
473
+ @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r63, %r64, %r65, %r66 };
474
+ .loc 1 35 4
475
+ ret;
476
+ $L__tmp1:
477
+ $L__func_end0:
478
+
479
+ }
480
+ // .globl __nv_erff
481
+ .visible .func (.param .b32 func_retval0) __nv_erff(
482
+ .param .b32 __nv_erff_param_0
483
+ )
484
+ {
485
+ .reg .pred %p<4>;
486
+ .reg .b32 %r<5>;
487
+ .reg .f32 %f<49>;
488
+ $L__func_begin1:
489
+
490
+ ld.param.f32 %f14, [__nv_erff_param_0];
491
+ abs.ftz.f32 %f1, %f14;
492
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
493
+ mov.f32 %f46, 0f3789CA3C;
494
+ mov.f32 %f45, 0fB9F560B9;
495
+ mov.f32 %f44, 0f3BAC840B;
496
+ mov.f32 %f43, 0fBD0C8162;
497
+ mov.f32 %f42, 0f3E1CF906;
498
+ mov.f32 %f41, 0f3F6A937E;
499
+ mov.f32 %f40, 0f3F20D842;
500
+ mov.f32 %f47, %f1;
501
+ @%p1 bra $L__BB1_2;
502
+ mul.f32 %f47, %f14, %f14;
503
+ mov.f32 %f46, 0f38B1E96A;
504
+ mov.f32 %f45, 0fBA574D20;
505
+ mov.f32 %f44, 0f3BAAD5EA;
506
+ mov.f32 %f43, 0fBCDC1BE7;
507
+ mov.f32 %f42, 0f3DE718AF;
508
+ mov.f32 %f41, 0fBEC093AC;
509
+ mov.f32 %f40, 0f3E0375D3;
510
+ $L__BB1_2:
511
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
512
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
513
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
514
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
515
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
516
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
517
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
518
+ neg.f32 %f35, %f47;
519
+ selp.f32 %f36, %f35, %f14, %p1;
520
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
521
+ @%p2 bra $L__BB1_4;
522
+ ex2.approx.ftz.f32 %f37, %f48;
523
+ mov.f32 %f38, 0f3F800000;
524
+ sub.f32 %f39, %f38, %f37;
525
+ mov.b32 %r1, %f39;
526
+ mov.b32 %r2, %f14;
527
+ and.b32 %r3, %r2, -2147483648;
528
+ or.b32 %r4, %r3, %r1;
529
+ mov.b32 %f48, %r4;
530
+ $L__BB1_4:
531
+ st.param.f32 [func_retval0+0], %f48;
532
+ ret;
533
+ $L__func_end1:
534
+
535
+ }
536
+ .file 1 "/tmp/torchinductor_root/af/cafucwnmq4o436kwzkmrinerrnocxll7q6wsadcl726g6cradipo.py"
537
+ .section .debug_abbrev
538
+ {
539
+ .b8 1
540
+ .b8 17
541
+ .b8 1
542
+ .b8 37
543
+ .b8 8
544
+ .b8 19
545
+ .b8 5
546
+ .b8 3
547
+ .b8 8
548
+ .b8 16
549
+ .b8 6
550
+ .b8 27
551
+ .b8 8
552
+ .b8 180
553
+ .b8 66
554
+ .b8 12
555
+ .b8 17
556
+ .b8 1
557
+ .b8 18
558
+ .b8 1
559
+ .b8 0
560
+ .b8 0
561
+ .b8 2
562
+ .b8 46
563
+ .b8 0
564
+ .b8 17
565
+ .b8 1
566
+ .b8 18
567
+ .b8 1
568
+ .b8 64
569
+ .b8 10
570
+ .b8 135
571
+ .b8 64
572
+ .b8 8
573
+ .b8 3
574
+ .b8 8
575
+ .b8 58
576
+ .b8 11
577
+ .b8 59
578
+ .b8 11
579
+ .b8 63
580
+ .b8 12
581
+ .b8 0
582
+ .b8 0
583
+ .b8 0
584
+ }
585
+ .section .debug_info
586
+ {
587
+ .b32 172
588
+ .b8 2
589
+ .b8 0
590
+ .b32 .debug_abbrev
591
+ .b8 8
592
+ .b8 1
593
+ .b8 116
594
+ .b8 114
595
+ .b8 105
596
+ .b8 116
597
+ .b8 111
598
+ .b8 110
599
+ .b8 0
600
+ .b8 2
601
+ .b8 0
602
+ .b8 99
603
+ .b8 97
604
+ .b8 102
605
+ .b8 117
606
+ .b8 99
607
+ .b8 119
608
+ .b8 110
609
+ .b8 109
610
+ .b8 113
611
+ .b8 52
612
+ .b8 111
613
+ .b8 52
614
+ .b8 51
615
+ .b8 54
616
+ .b8 107
617
+ .b8 119
618
+ .b8 122
619
+ .b8 107
620
+ .b8 109
621
+ .b8 114
622
+ .b8 105
623
+ .b8 110
624
+ .b8 101
625
+ .b8 114
626
+ .b8 114
627
+ .b8 110
628
+ .b8 111
629
+ .b8 99
630
+ .b8 120
631
+ .b8 108
632
+ .b8 108
633
+ .b8 55
634
+ .b8 113
635
+ .b8 54
636
+ .b8 119
637
+ .b8 115
638
+ .b8 97
639
+ .b8 100
640
+ .b8 99
641
+ .b8 108
642
+ .b8 55
643
+ .b8 50
644
+ .b8 54
645
+ .b8 103
646
+ .b8 54
647
+ .b8 99
648
+ .b8 114
649
+ .b8 97
650
+ .b8 100
651
+ .b8 105
652
+ .b8 112
653
+ .b8 111
654
+ .b8 46
655
+ .b8 112
656
+ .b8 121
657
+ .b8 0
658
+ .b32 .debug_line
659
+ .b8 47
660
+ .b8 116
661
+ .b8 109
662
+ .b8 112
663
+ .b8 47
664
+ .b8 116
665
+ .b8 111
666
+ .b8 114
667
+ .b8 99
668
+ .b8 104
669
+ .b8 105
670
+ .b8 110
671
+ .b8 100
672
+ .b8 117
673
+ .b8 99
674
+ .b8 116
675
+ .b8 111
676
+ .b8 114
677
+ .b8 95
678
+ .b8 114
679
+ .b8 111
680
+ .b8 111
681
+ .b8 116
682
+ .b8 47
683
+ .b8 97
684
+ .b8 102
685
+ .b8 0
686
+ .b8 1
687
+ .b64 $L__func_begin0
688
+ .b64 $L__func_end0
689
+ .b8 2
690
+ .b64 $L__func_begin0
691
+ .b64 $L__func_end0
692
+ .b8 1
693
+ .b8 156
694
+ .b8 116
695
+ .b8 114
696
+ .b8 105
697
+ .b8 116
698
+ .b8 111
699
+ .b8 110
700
+ .b8 95
701
+ .b8 95
702
+ .b8 48
703
+ .b8 100
704
+ .b8 49
705
+ .b8 100
706
+ .b8 101
707
+ .b8 0
708
+ .b8 116
709
+ .b8 114
710
+ .b8 105
711
+ .b8 116
712
+ .b8 111
713
+ .b8 110
714
+ .b8 95
715
+ .b8 95
716
+ .b8 48
717
+ .b8 100
718
+ .b8 49
719
+ .b8 100
720
+ .b8 101
721
+ .b8 0
722
+ .b8 1
723
+ .b8 18
724
+ .b8 1
725
+ .b8 0
726
+ }
727
+ .section .debug_pubnames
728
+ {
729
+ .b32 $L__pubNames_end0-$L__pubNames_start0
730
+ $L__pubNames_start0:
731
+ .b8 2
732
+ .b8 0
733
+ .b32 .debug_info
734
+ .b32 176
735
+ .b32 125
736
+ .b8 116
737
+ .b8 114
738
+ .b8 105
739
+ .b8 116
740
+ .b8 111
741
+ .b8 110
742
+ .b8 95
743
+ .b8 95
744
+ .b8 48
745
+ .b8 100
746
+ .b8 49
747
+ .b8 100
748
+ .b8 101
749
+ .b8 0
750
+ .b32 0
751
+ $L__pubNames_end0:
752
+ }
753
+ .section .debug_pubtypes
754
+ {
755
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
756
+ $L__pubTypes_start0:
757
+ .b8 2
758
+ .b8 0
759
+ .b32 .debug_info
760
+ .b32 176
761
+ .b32 0
762
+ $L__pubTypes_end0:
763
+ }
764
+ .section .debug_loc { }
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
5
+ %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
7
+ %c1024_i32 = arith.constant 1024 : i32
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.muli %0, %c1024_i32 : i32
10
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
11
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
12
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
13
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
14
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
15
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
16
+ %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
17
+ %9 = arith.mulf %8, %cst_1 : tensor<1024xf32, #blocked>
18
+ %10 = arith.mulf %8, %cst_0 : tensor<1024xf32, #blocked>
19
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
20
+ %12 = arith.addf %11, %cst : tensor<1024xf32, #blocked>
21
+ %13 = arith.mulf %9, %12 : tensor<1024xf32, #blocked>
22
+ %14 = arith.truncf %13 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
23
+ tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
24
+ tt.return
25
+ }
26
+ }
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
6
+ %c1024_i32 = arith.constant 1024 : i32
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c1024_i32 : i32
9
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
10
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
11
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
12
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
13
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
14
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
15
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
16
+ %9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
17
+ %10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
18
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
19
+ %12 = arith.addf %11, %cst : tensor<1024xf32>
20
+ %13 = arith.mulf %9, %12 : tensor<1024xf32>
21
+ %14 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
22
+ tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
23
+ tt.return
24
+ }
25
+ }
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin ADDED
Binary file (14.6 kB). View file
 
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = and i32 %6, 7, !dbg !8
11
+ %10 = shl nuw nsw i32 %9, 2, !dbg !8
12
+ %11 = and i32 %8, 7, !dbg !9
13
+ %12 = lshr i32 %7, 3, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 2, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = or i32 %14, 96, !dbg !9
17
+ %16 = or i32 %10, 1, !dbg !10
18
+ %17 = or i32 %10, 2, !dbg !10
19
+ %18 = or i32 %10, 3, !dbg !10
20
+ %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
21
+ %20 = shl i32 %19, 5, !dbg !15
22
+ %21 = or i32 %20, %10, !dbg !16
23
+ %22 = or i32 %20, %7, !dbg !16
24
+ %23 = icmp ult i32 %15, 120, !dbg !17
25
+ %24 = shl nuw nsw i32 %14, 17, !dbg !18
26
+ %25 = or i32 %24, 4194304, !dbg !18
27
+ %26 = or i32 %24, 8388608, !dbg !18
28
+ %27 = shl nuw nsw i32 %15, 17, !dbg !18
29
+ %28 = add i32 %21, %24, !dbg !19
30
+ %29 = add i32 %25, %21, !dbg !19
31
+ %30 = add i32 %26, %21, !dbg !19
32
+ %31 = add i32 %21, %27, !dbg !19
33
+ %32 = sext i32 %28 to i64, !dbg !20
34
+ %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !20
35
+ %34 = sext i32 %29 to i64, !dbg !20
36
+ %35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !20
37
+ %36 = sext i32 %30 to i64, !dbg !20
38
+ %37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !20
39
+ %38 = sext i32 %31 to i64, !dbg !20
40
+ %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !20
41
+ %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
42
+ %41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !21
43
+ %42 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !21
44
+ %43 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !21
45
+ %44 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !21
46
+ %45 = bitcast i32 %41 to float, !dbg !21
47
+ %46 = bitcast i32 %42 to float, !dbg !21
48
+ %47 = bitcast i32 %43 to float, !dbg !21
49
+ %48 = bitcast i32 %44 to float, !dbg !21
50
+ %49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
51
+ %50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !21
52
+ %51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !21
53
+ %52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !21
54
+ %53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !21
55
+ %54 = bitcast i32 %50 to float, !dbg !21
56
+ %55 = bitcast i32 %51 to float, !dbg !21
57
+ %56 = bitcast i32 %52 to float, !dbg !21
58
+ %57 = bitcast i32 %53 to float, !dbg !21
59
+ %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
60
+ %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !21
61
+ %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !21
62
+ %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !21
63
+ %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !21
64
+ %63 = bitcast i32 %59 to float, !dbg !21
65
+ %64 = bitcast i32 %60 to float, !dbg !21
66
+ %65 = bitcast i32 %61 to float, !dbg !21
67
+ %66 = bitcast i32 %62 to float, !dbg !21
68
+ %67 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23) #3, !dbg !21
69
+ %68 = extractvalue { i32, i32, i32, i32 } %67, 0, !dbg !21
70
+ %69 = extractvalue { i32, i32, i32, i32 } %67, 1, !dbg !21
71
+ %70 = extractvalue { i32, i32, i32, i32 } %67, 2, !dbg !21
72
+ %71 = extractvalue { i32, i32, i32, i32 } %67, 3, !dbg !21
73
+ %72 = bitcast i32 %68 to float, !dbg !21
74
+ %73 = bitcast i32 %69 to float, !dbg !21
75
+ %74 = bitcast i32 %70 to float, !dbg !21
76
+ %75 = bitcast i32 %71 to float, !dbg !21
77
+ %76 = fadd float %45, 0.000000e+00, !dbg !22
78
+ %77 = fadd float %46, 0.000000e+00, !dbg !22
79
+ %78 = fadd float %47, 0.000000e+00, !dbg !22
80
+ %79 = fadd float %48, 0.000000e+00, !dbg !22
81
+ %80 = fadd float %54, 0.000000e+00, !dbg !22
82
+ %81 = fadd float %55, 0.000000e+00, !dbg !22
83
+ %82 = fadd float %56, 0.000000e+00, !dbg !22
84
+ %83 = fadd float %57, 0.000000e+00, !dbg !22
85
+ %84 = fadd float %63, 0.000000e+00, !dbg !22
86
+ %85 = fadd float %64, 0.000000e+00, !dbg !22
87
+ %86 = fadd float %65, 0.000000e+00, !dbg !22
88
+ %87 = fadd float %66, 0.000000e+00, !dbg !22
89
+ %88 = fadd float %72, 0.000000e+00, !dbg !22
90
+ %89 = fadd float %73, 0.000000e+00, !dbg !22
91
+ %90 = fadd float %74, 0.000000e+00, !dbg !22
92
+ %91 = fadd float %75, 0.000000e+00, !dbg !22
93
+ %92 = select i1 %23, float %88, float 0.000000e+00, !dbg !23
94
+ %93 = select i1 %23, float %89, float 0.000000e+00, !dbg !23
95
+ %94 = select i1 %23, float %90, float 0.000000e+00, !dbg !23
96
+ %95 = select i1 %23, float %91, float 0.000000e+00, !dbg !23
97
+ %96 = fadd float %76, %80, !dbg !24
98
+ %97 = fadd float %77, %81, !dbg !24
99
+ %98 = fadd float %78, %82, !dbg !24
100
+ %99 = fadd float %79, %83, !dbg !24
101
+ %100 = fadd float %96, %84, !dbg !24
102
+ %101 = fadd float %97, %85, !dbg !24
103
+ %102 = fadd float %98, %86, !dbg !24
104
+ %103 = fadd float %99, %87, !dbg !24
105
+ %104 = fadd float %100, %92, !dbg !24
106
+ %105 = fadd float %101, %93, !dbg !24
107
+ %106 = fadd float %102, %94, !dbg !24
108
+ %107 = fadd float %103, %95, !dbg !24
109
+ %108 = bitcast float %104 to i32, !dbg !10
110
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !10
111
+ %110 = bitcast i32 %109 to float, !dbg !10
112
+ %111 = fadd float %104, %110, !dbg !24
113
+ %112 = bitcast float %111 to i32, !dbg !10
114
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !10
115
+ %114 = bitcast i32 %113 to float, !dbg !10
116
+ %115 = fadd float %111, %114, !dbg !24
117
+ %116 = bitcast float %105 to i32, !dbg !10
118
+ %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !10
119
+ %118 = bitcast i32 %117 to float, !dbg !10
120
+ %119 = fadd float %105, %118, !dbg !24
121
+ %120 = bitcast float %119 to i32, !dbg !10
122
+ %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 8, i32 31), !dbg !10
123
+ %122 = bitcast i32 %121 to float, !dbg !10
124
+ %123 = fadd float %119, %122, !dbg !24
125
+ %124 = bitcast float %106 to i32, !dbg !10
126
+ %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 16, i32 31), !dbg !10
127
+ %126 = bitcast i32 %125 to float, !dbg !10
128
+ %127 = fadd float %106, %126, !dbg !24
129
+ %128 = bitcast float %127 to i32, !dbg !10
130
+ %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !10
131
+ %130 = bitcast i32 %129 to float, !dbg !10
132
+ %131 = fadd float %127, %130, !dbg !24
133
+ %132 = bitcast float %107 to i32, !dbg !10
134
+ %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !10
135
+ %134 = bitcast i32 %133 to float, !dbg !10
136
+ %135 = fadd float %107, %134, !dbg !24
137
+ %136 = bitcast float %135 to i32, !dbg !10
138
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !10
139
+ %138 = bitcast i32 %137 to float, !dbg !10
140
+ %139 = fadd float %135, %138, !dbg !24
141
+ %140 = icmp ult i32 %7, 8, !dbg !10
142
+ %141 = shl nuw nsw i32 %9, 5, !dbg !10
143
+ %142 = or i32 %141, %11, !dbg !10
144
+ %143 = zext nneg i32 %142 to i64, !dbg !10
145
+ %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !10
146
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %144, float %115, i1 %140) #3, !dbg !10
147
+ %145 = shl nuw nsw i32 %16, 3, !dbg !10
148
+ %146 = or i32 %145, %11, !dbg !10
149
+ %147 = zext nneg i32 %146 to i64, !dbg !10
150
+ %148 = getelementptr float, ptr addrspace(3) @global_smem, i64 %147, !dbg !10
151
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, float %123, i1 %140) #3, !dbg !10
152
+ %149 = shl nuw nsw i32 %17, 3, !dbg !10
153
+ %150 = or i32 %149, %11, !dbg !10
154
+ %151 = zext nneg i32 %150 to i64, !dbg !10
155
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
156
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %152, float %131, i1 %140) #3, !dbg !10
157
+ %153 = shl nuw nsw i32 %18, 3, !dbg !10
158
+ %154 = or i32 %153, %11, !dbg !10
159
+ %155 = zext nneg i32 %154 to i64, !dbg !10
160
+ %156 = getelementptr float, ptr addrspace(3) @global_smem, i64 %155, !dbg !10
161
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %156, float %139, i1 %140) #3, !dbg !10
162
+ tail call void @llvm.nvvm.barrier0(), !dbg !10
163
+ %157 = icmp slt i32 %6, 256, !dbg !10
164
+ %158 = sext i32 %6 to i64, !dbg !10
165
+ %159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !10
166
+ %160 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %159, i1 %157) #3, !dbg !10
167
+ %161 = bitcast float %160 to i32, !dbg !10
168
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !10
169
+ %163 = bitcast i32 %162 to float, !dbg !10
170
+ %164 = fadd float %160, %163, !dbg !24
171
+ %165 = bitcast float %164 to i32, !dbg !10
172
+ %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !10
173
+ %167 = bitcast i32 %166 to float, !dbg !10
174
+ %168 = fadd float %164, %167, !dbg !24
175
+ %169 = bitcast float %168 to i32, !dbg !10
176
+ %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !10
177
+ %171 = bitcast i32 %170 to float, !dbg !10
178
+ %172 = fadd float %168, %171, !dbg !24
179
+ %173 = icmp eq i32 %9, 0, !dbg !10
180
+ %174 = and i1 %157, %173, !dbg !10
181
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %172, i1 %174) #3, !dbg !10
182
+ tail call void @llvm.nvvm.barrier0(), !dbg !10
183
+ %175 = zext nneg i32 %141 to i64, !dbg !10
184
+ %176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !10
185
+ %177 = load float, ptr addrspace(3) %176, align 4, !dbg !10
186
+ %178 = zext nneg i32 %145 to i64, !dbg !10
187
+ %179 = getelementptr float, ptr addrspace(3) @global_smem, i64 %178, !dbg !10
188
+ %180 = load float, ptr addrspace(3) %179, align 4, !dbg !10
189
+ %181 = zext nneg i32 %149 to i64, !dbg !10
190
+ %182 = getelementptr float, ptr addrspace(3) @global_smem, i64 %181, !dbg !10
191
+ %183 = load float, ptr addrspace(3) %182, align 4, !dbg !10
192
+ %184 = zext nneg i32 %153 to i64, !dbg !10
193
+ %185 = getelementptr float, ptr addrspace(3) @global_smem, i64 %184, !dbg !10
194
+ %186 = load float, ptr addrspace(3) %185, align 4, !dbg !10
195
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
196
+ %187 = zext nneg i32 %10 to i64, !dbg !28
197
+ %188 = getelementptr float, ptr addrspace(3) @global_smem, i64 %187, !dbg !28
198
+ %189 = insertelement <1 x float> undef, float %177, i64 0, !dbg !28
199
+ store <1 x float> %189, ptr addrspace(3) %188, align 4, !dbg !28
200
+ %190 = zext nneg i32 %16 to i64, !dbg !28
201
+ %191 = getelementptr float, ptr addrspace(3) @global_smem, i64 %190, !dbg !28
202
+ %192 = insertelement <1 x float> undef, float %180, i64 0, !dbg !28
203
+ store <1 x float> %192, ptr addrspace(3) %191, align 4, !dbg !28
204
+ %193 = zext nneg i32 %17 to i64, !dbg !28
205
+ %194 = getelementptr float, ptr addrspace(3) @global_smem, i64 %193, !dbg !28
206
+ %195 = insertelement <1 x float> undef, float %183, i64 0, !dbg !28
207
+ store <1 x float> %195, ptr addrspace(3) %194, align 4, !dbg !28
208
+ %196 = zext nneg i32 %18 to i64, !dbg !28
209
+ %197 = getelementptr float, ptr addrspace(3) @global_smem, i64 %196, !dbg !28
210
+ %198 = insertelement <1 x float> undef, float %186, i64 0, !dbg !28
211
+ store <1 x float> %198, ptr addrspace(3) %197, align 4, !dbg !28
212
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
213
+ %199 = zext nneg i32 %7 to i64, !dbg !28
214
+ %200 = getelementptr float, ptr addrspace(3) @global_smem, i64 %199, !dbg !28
215
+ %201 = load <1 x float>, ptr addrspace(3) %200, align 4, !dbg !28
216
+ %.frozen = freeze i32 %22
217
+ %202 = sdiv i32 %.frozen, 256, !dbg !29
218
+ %203 = mul i32 %202, 256
219
+ %.decomposed = sub i32 %.frozen, %203
220
+ %204 = sext i32 %202 to i64, !dbg !30
221
+ %205 = getelementptr i64, ptr addrspace(1) %1, i64 %204, !dbg !30
222
+ %206 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %205, i1 true) #3, !dbg !31
223
+ %207 = lshr i64 %206, 54, !dbg !32
224
+ %208 = and i64 %207, 512, !dbg !32
225
+ %209 = add i64 %208, %206, !dbg !32
226
+ %210 = shl i64 %209, 8, !dbg !33
227
+ %211 = sext i32 %.decomposed to i64, !dbg !34
228
+ %212 = getelementptr float, ptr addrspace(1) %2, i64 %210, !dbg !35
229
+ %213 = getelementptr float, ptr addrspace(1) %212, i64 %211, !dbg !35
230
+ %214 = icmp eq i32 %11, 0, !dbg !36
231
+ %215 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %213, <1 x float> %201, i1 %214) #3, !dbg !36
232
+ ret void, !dbg !37
233
+ }
234
+
235
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
236
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
237
+
238
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
239
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
240
+
241
+ ; Function Attrs: convergent nocallback nounwind
242
+ declare void @llvm.nvvm.barrier0() #2
243
+
244
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
245
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
246
+ attributes #2 = { convergent nocallback nounwind }
247
+ attributes #3 = { nounwind }
248
+
249
+ !llvm.module.flags = !{!0}
250
+ !llvm.dbg.cu = !{!1}
251
+ !nvvm.annotations = !{!3, !4, !4, !3}
252
+
253
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
254
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
255
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
256
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
257
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
258
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
259
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
260
+ !7 = !{}
261
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
262
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
263
+ !10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
264
+ !11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
265
+ !12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
266
+ !13 = !DILocation(line: 35, column: 25, scope: !11)
267
+ !14 = !DILocation(line: 21, column: 28, scope: !5)
268
+ !15 = !DILocation(line: 21, column: 33, scope: !5)
269
+ !16 = !DILocation(line: 22, column: 23, scope: !5)
270
+ !17 = !DILocation(line: 29, column: 25, scope: !5)
271
+ !18 = !DILocation(line: 31, column: 47, scope: !5)
272
+ !19 = !DILocation(line: 31, column: 40, scope: !5)
273
+ !20 = !DILocation(line: 31, column: 34, scope: !5)
274
+ !21 = !DILocation(line: 31, column: 53, scope: !5)
275
+ !22 = !DILocation(line: 33, column: 23, scope: !5)
276
+ !23 = !DILocation(line: 34, column: 38, scope: !5)
277
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
278
+ !25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
279
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
280
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
281
+ !28 = !DILocation(line: 35, column: 28, scope: !5)
282
+ !29 = !DILocation(line: 36, column: 20, scope: !5)
283
+ !30 = !DILocation(line: 38, column: 30, scope: !5)
284
+ !31 = !DILocation(line: 38, column: 35, scope: !5)
285
+ !32 = !DILocation(line: 41, column: 32, scope: !5)
286
+ !33 = !DILocation(line: 45, column: 40, scope: !5)
287
+ !34 = !DILocation(line: 45, column: 36, scope: !5)
288
+ !35 = !DILocation(line: 45, column: 30, scope: !5)
289
+ !36 = !DILocation(line: 45, column: 55, scope: !5)
290
+ !37 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 256, 1, 1
20
+ {
21
+ .reg .pred %p<30>;
22
+ .reg .b32 %r<112>;
23
+ .reg .f32 %f<76>;
24
+ .reg .b64 %rd<22>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_0];
30
+ ld.param.u64 %rd9, [triton__0d1d2d3de4e_param_1];
31
+ $L__tmp0:
32
+ .loc 1 22 44
33
+ mov.u32 %r48, %tid.x;
34
+ and.b32 %r49, %r48, 31;
35
+ ld.param.u64 %rd10, [triton__0d1d2d3de4e_param_2];
36
+ and.b32 %r50, %r48, 7;
37
+ shl.b32 %r51, %r50, 2;
38
+ .loc 1 24 33
39
+ bfe.u32 %r52, %r48, 5, 3;
40
+ bfe.u32 %r53, %r48, 3, 2;
41
+ shl.b32 %r54, %r52, 2;
42
+ or.b32 %r55, %r54, %r53;
43
+ or.b32 %r56, %r55, 96;
44
+ .loc 1 21 28
45
+ mov.u32 %r1, %ctaid.x;
46
+ .loc 1 21 33
47
+ shl.b32 %r57, %r1, 5;
48
+ .loc 1 22 23
49
+ or.b32 %r58, %r57, %r51;
50
+ or.b32 %r59, %r57, %r49;
51
+ .loc 1 29 25
52
+ setp.lt.u32 %p16, %r56, 120;
53
+ .loc 1 31 47
54
+ shl.b32 %r60, %r55, 17;
55
+ shl.b32 %r61, %r56, 17;
56
+ .loc 1 31 40
57
+ add.s32 %r62, %r58, %r60;
58
+ add.s32 %r63, %r62, 4194304;
59
+ add.s32 %r64, %r62, 8388608;
60
+ add.s32 %r65, %r58, %r61;
61
+ .loc 1 31 34
62
+ mul.wide.s32 %rd11, %r62, 4;
63
+ add.s64 %rd1, %rd8, %rd11;
64
+ mul.wide.s32 %rd12, %r63, 4;
65
+ add.s64 %rd2, %rd8, %rd12;
66
+ mul.wide.s32 %rd13, %r64, 4;
67
+ add.s64 %rd3, %rd8, %rd13;
68
+ mul.wide.s32 %rd14, %r65, 4;
69
+ add.s64 %rd4, %rd8, %rd14;
70
+ mov.b32 %r6, 0;
71
+ mov.pred %p1, -1;
72
+ .loc 1 31 53
73
+ mov.u32 %r2, 0x0;
74
+ mov.u32 %r3, 0x0;
75
+ mov.u32 %r4, 0x0;
76
+ mov.u32 %r5, 0x0;
77
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
78
+ @!%p1 mov.u32 %r2, %r6;
79
+ @!%p1 mov.u32 %r3, %r6;
80
+ @!%p1 mov.u32 %r4, %r6;
81
+ @!%p1 mov.u32 %r5, %r6;
82
+ mov.b32 %f1, %r2;
83
+ mov.b32 %f2, %r3;
84
+ mov.b32 %f3, %r4;
85
+ mov.b32 %f4, %r5;
86
+ mov.u32 %r10, 0x0;
87
+ mov.u32 %r11, 0x0;
88
+ mov.u32 %r12, 0x0;
89
+ mov.u32 %r13, 0x0;
90
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
91
+ @!%p1 mov.u32 %r10, %r6;
92
+ @!%p1 mov.u32 %r11, %r6;
93
+ @!%p1 mov.u32 %r12, %r6;
94
+ @!%p1 mov.u32 %r13, %r6;
95
+ mov.b32 %f5, %r10;
96
+ mov.b32 %f6, %r11;
97
+ mov.b32 %f7, %r12;
98
+ mov.b32 %f8, %r13;
99
+ mov.u32 %r18, 0x0;
100
+ mov.u32 %r19, 0x0;
101
+ mov.u32 %r20, 0x0;
102
+ mov.u32 %r21, 0x0;
103
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
104
+ @!%p1 mov.u32 %r18, %r6;
105
+ @!%p1 mov.u32 %r19, %r6;
106
+ @!%p1 mov.u32 %r20, %r6;
107
+ @!%p1 mov.u32 %r21, %r6;
108
+ mov.b32 %f9, %r18;
109
+ mov.b32 %f10, %r19;
110
+ mov.b32 %f11, %r20;
111
+ mov.b32 %f12, %r21;
112
+ mov.u32 %r26, 0x0;
113
+ mov.u32 %r27, 0x0;
114
+ mov.u32 %r28, 0x0;
115
+ mov.u32 %r29, 0x0;
116
+ @%p16 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
117
+ @!%p16 mov.u32 %r26, %r6;
118
+ @!%p16 mov.u32 %r27, %r6;
119
+ @!%p16 mov.u32 %r28, %r6;
120
+ @!%p16 mov.u32 %r29, %r6;
121
+ mov.b32 %f13, %r26;
122
+ mov.b32 %f14, %r27;
123
+ mov.b32 %f15, %r28;
124
+ mov.b32 %f16, %r29;
125
+ .loc 1 33 23
126
+ add.f32 %f17, %f1, 0f00000000;
127
+ add.f32 %f18, %f2, 0f00000000;
128
+ add.f32 %f19, %f3, 0f00000000;
129
+ add.f32 %f20, %f4, 0f00000000;
130
+ add.f32 %f21, %f5, 0f00000000;
131
+ add.f32 %f22, %f6, 0f00000000;
132
+ add.f32 %f23, %f7, 0f00000000;
133
+ add.f32 %f24, %f8, 0f00000000;
134
+ add.f32 %f25, %f9, 0f00000000;
135
+ add.f32 %f26, %f10, 0f00000000;
136
+ add.f32 %f27, %f11, 0f00000000;
137
+ add.f32 %f28, %f12, 0f00000000;
138
+ add.f32 %f29, %f13, 0f00000000;
139
+ add.f32 %f30, %f14, 0f00000000;
140
+ add.f32 %f31, %f15, 0f00000000;
141
+ add.f32 %f32, %f16, 0f00000000;
142
+ .loc 1 34 38
143
+ selp.f32 %f33, %f29, 0f00000000, %p16;
144
+ selp.f32 %f34, %f30, 0f00000000, %p16;
145
+ selp.f32 %f35, %f31, 0f00000000, %p16;
146
+ selp.f32 %f36, %f32, 0f00000000, %p16;
147
+ $L__tmp1:
148
+ .loc 2 233 15
149
+ add.f32 %f37, %f17, %f21;
150
+ add.f32 %f38, %f18, %f22;
151
+ add.f32 %f39, %f19, %f23;
152
+ add.f32 %f40, %f20, %f24;
153
+ add.f32 %f41, %f37, %f25;
154
+ add.f32 %f42, %f38, %f26;
155
+ add.f32 %f43, %f39, %f27;
156
+ add.f32 %f44, %f40, %f28;
157
+ add.f32 %f45, %f41, %f33;
158
+ add.f32 %f46, %f42, %f34;
159
+ add.f32 %f47, %f43, %f35;
160
+ add.f32 %f48, %f44, %f36;
161
+ $L__tmp2:
162
+ .loc 2 243 36
163
+ mov.b32 %r66, %f45;
164
+ shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
165
+ mov.b32 %f49, %r67;
166
+ $L__tmp3:
167
+ .loc 2 233 15
168
+ add.f32 %f50, %f45, %f49;
169
+ $L__tmp4:
170
+ .loc 2 243 36
171
+ mov.b32 %r68, %f50;
172
+ shfl.sync.bfly.b32 %r69, %r68, 8, 31, -1;
173
+ mov.b32 %f51, %r69;
174
+ $L__tmp5:
175
+ .loc 2 233 15
176
+ add.f32 %f52, %f50, %f51;
177
+ $L__tmp6:
178
+ .loc 2 243 36
179
+ mov.b32 %r70, %f46;
180
+ shfl.sync.bfly.b32 %r71, %r70, 16, 31, -1;
181
+ mov.b32 %f53, %r71;
182
+ $L__tmp7:
183
+ .loc 2 233 15
184
+ add.f32 %f54, %f46, %f53;
185
+ $L__tmp8:
186
+ .loc 2 243 36
187
+ mov.b32 %r72, %f54;
188
+ shfl.sync.bfly.b32 %r73, %r72, 8, 31, -1;
189
+ mov.b32 %f55, %r73;
190
+ $L__tmp9:
191
+ .loc 2 233 15
192
+ add.f32 %f56, %f54, %f55;
193
+ $L__tmp10:
194
+ .loc 2 243 36
195
+ mov.b32 %r74, %f47;
196
+ shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
197
+ mov.b32 %f57, %r75;
198
+ $L__tmp11:
199
+ .loc 2 233 15
200
+ add.f32 %f58, %f47, %f57;
201
+ $L__tmp12:
202
+ .loc 2 243 36
203
+ mov.b32 %r76, %f58;
204
+ shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
205
+ mov.b32 %f59, %r77;
206
+ $L__tmp13:
207
+ .loc 2 233 15
208
+ add.f32 %f60, %f58, %f59;
209
+ $L__tmp14:
210
+ .loc 2 243 36
211
+ mov.b32 %r78, %f48;
212
+ shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
213
+ mov.b32 %f61, %r79;
214
+ $L__tmp15:
215
+ .loc 2 233 15
216
+ add.f32 %f62, %f48, %f61;
217
+ $L__tmp16:
218
+ .loc 2 243 36
219
+ mov.b32 %r80, %f62;
220
+ shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
221
+ mov.b32 %f63, %r81;
222
+ $L__tmp17:
223
+ .loc 2 233 15
224
+ add.f32 %f64, %f62, %f63;
225
+ $L__tmp18:
226
+ .loc 2 243 36
227
+ setp.lt.u32 %p21, %r49, 8;
228
+ shl.b32 %r82, %r50, 7;
229
+ or.b32 %r83, %r82, %r54;
230
+ mov.u32 %r84, global_smem;
231
+ add.s32 %r34, %r84, %r83;
232
+ mov.b32 %r35, %f52;
233
+ @%p21 st.shared.b32 [ %r34 + 0 ], %r35;
234
+ or.b32 %r85, %r82, 32;
235
+ or.b32 %r86, %r85, %r54;
236
+ add.s32 %r36, %r84, %r86;
237
+ mov.b32 %r37, %f56;
238
+ @%p21 st.shared.b32 [ %r36 + 0 ], %r37;
239
+ or.b32 %r87, %r82, 64;
240
+ or.b32 %r88, %r87, %r54;
241
+ add.s32 %r38, %r84, %r88;
242
+ mov.b32 %r39, %f60;
243
+ @%p21 st.shared.b32 [ %r38 + 0 ], %r39;
244
+ or.b32 %r89, %r82, 96;
245
+ or.b32 %r90, %r89, %r54;
246
+ add.s32 %r40, %r84, %r90;
247
+ mov.b32 %r41, %f64;
248
+ @%p21 st.shared.b32 [ %r40 + 0 ], %r41;
249
+ bar.sync 0;
250
+ setp.lt.s32 %p25, %r48, 256;
251
+ shl.b32 %r91, %r48, 2;
252
+ add.s32 %r43, %r84, %r91;
253
+ @%p25 ld.shared.b32 %r42, [ %r43 + 0 ];
254
+ mov.b32 %f65, %r42;
255
+ shfl.sync.bfly.b32 %r92, %r42, 4, 31, -1;
256
+ mov.b32 %f66, %r92;
257
+ $L__tmp19:
258
+ .loc 2 233 15
259
+ add.f32 %f67, %f65, %f66;
260
+ $L__tmp20:
261
+ .loc 2 243 36
262
+ mov.b32 %r93, %f67;
263
+ shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
264
+ mov.b32 %f68, %r94;
265
+ $L__tmp21:
266
+ .loc 2 233 15
267
+ add.f32 %f69, %f67, %f68;
268
+ $L__tmp22:
269
+ .loc 2 243 36
270
+ mov.b32 %r95, %f69;
271
+ shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
272
+ mov.b32 %f70, %r96;
273
+ $L__tmp23:
274
+ .loc 2 233 15
275
+ add.f32 %f71, %f69, %f70;
276
+ $L__tmp24:
277
+ .loc 2 243 36
278
+ setp.eq.s32 %p29, %r50, 0;
279
+ and.pred %p26, %p25, %p29;
280
+ mov.b32 %r45, %f71;
281
+ @%p26 st.shared.b32 [ %r43 + 0 ], %r45;
282
+ bar.sync 0;
283
+ add.s32 %r97, %r84, %r82;
284
+ ld.shared.f32 %f72, [%r97];
285
+ add.s32 %r98, %r84, %r85;
286
+ ld.shared.f32 %f73, [%r98];
287
+ add.s32 %r99, %r84, %r87;
288
+ ld.shared.f32 %f74, [%r99];
289
+ add.s32 %r100, %r84, %r89;
290
+ ld.shared.f32 %f75, [%r100];
291
+ $L__tmp25:
292
+ .loc 1 35 28
293
+ bar.sync 0;
294
+ shl.b32 %r101, %r50, 4;
295
+ add.s32 %r102, %r84, %r101;
296
+ st.shared.f32 [%r102], %f72;
297
+ st.shared.f32 [%r102+4], %f73;
298
+ st.shared.f32 [%r102+8], %f74;
299
+ st.shared.f32 [%r102+12], %f75;
300
+ bar.sync 0;
301
+ shl.b32 %r103, %r49, 2;
302
+ add.s32 %r104, %r84, %r103;
303
+ .loc 1 36 20
304
+ shr.s32 %r106, %r59, 31;
305
+ shr.u32 %r107, %r106, 24;
306
+ add.s32 %r108, %r59, %r107;
307
+ shr.s32 %r109, %r108, 8;
308
+ and.b32 %r110, %r108, -256;
309
+ sub.s32 %r111, %r59, %r110;
310
+ .loc 1 38 30
311
+ mul.wide.s32 %rd15, %r109, 8;
312
+ add.s64 %rd6, %rd9, %rd15;
313
+ .loc 1 45 55
314
+ ld.shared.u32 %r47, [%r104];
315
+ .loc 1 38 35
316
+ mov.u64 %rd5, 0x0;
317
+ @%p1 ld.global.L1::evict_last.b64 { %rd5 }, [ %rd6 + 0 ];
318
+ .loc 1 41 32
319
+ shr.u64 %rd16, %rd5, 54;
320
+ and.b64 %rd17, %rd16, 512;
321
+ add.s64 %rd18, %rd17, %rd5;
322
+ .loc 1 45 30
323
+ shl.b64 %rd19, %rd18, 10;
324
+ add.s64 %rd20, %rd10, %rd19;
325
+ mul.wide.s32 %rd21, %r111, 4;
326
+ add.s64 %rd7, %rd20, %rd21;
327
+ .loc 1 45 55
328
+ setp.eq.s32 %p28, %r52, 0;
329
+ mov.u32 %r46, 0x0;
330
+ @%p28 atom.global.gpu.acq_rel.add.f32 %r46, [ %rd7 + 0 ], %r47;
331
+ .loc 1 45 4
332
+ ret;
333
+ $L__tmp26:
334
+ $L__func_end0:
335
+
336
+ }
337
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
338
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
339
+ .section .debug_abbrev
340
+ {
341
+ .b8 1
342
+ .b8 17
343
+ .b8 1
344
+ .b8 37
345
+ .b8 8
346
+ .b8 19
347
+ .b8 5
348
+ .b8 3
349
+ .b8 8
350
+ .b8 16
351
+ .b8 6
352
+ .b8 27
353
+ .b8 8
354
+ .b8 180
355
+ .b8 66
356
+ .b8 12
357
+ .b8 17
358
+ .b8 1
359
+ .b8 18
360
+ .b8 1
361
+ .b8 0
362
+ .b8 0
363
+ .b8 2
364
+ .b8 46
365
+ .b8 0
366
+ .b8 135
367
+ .b8 64
368
+ .b8 8
369
+ .b8 3
370
+ .b8 8
371
+ .b8 58
372
+ .b8 11
373
+ .b8 59
374
+ .b8 11
375
+ .b8 63
376
+ .b8 12
377
+ .b8 32
378
+ .b8 11
379
+ .b8 0
380
+ .b8 0
381
+ .b8 3
382
+ .b8 46
383
+ .b8 1
384
+ .b8 17
385
+ .b8 1
386
+ .b8 18
387
+ .b8 1
388
+ .b8 64
389
+ .b8 10
390
+ .b8 49
391
+ .b8 19
392
+ .b8 0
393
+ .b8 0
394
+ .b8 4
395
+ .b8 29
396
+ .b8 1
397
+ .b8 49
398
+ .b8 19
399
+ .b8 17
400
+ .b8 1
401
+ .b8 18
402
+ .b8 1
403
+ .b8 88
404
+ .b8 11
405
+ .b8 89
406
+ .b8 11
407
+ .b8 87
408
+ .b8 11
409
+ .b8 0
410
+ .b8 0
411
+ .b8 5
412
+ .b8 29
413
+ .b8 0
414
+ .b8 49
415
+ .b8 19
416
+ .b8 17
417
+ .b8 1
418
+ .b8 18
419
+ .b8 1
420
+ .b8 88
421
+ .b8 11
422
+ .b8 89
423
+ .b8 11
424
+ .b8 87
425
+ .b8 11
426
+ .b8 0
427
+ .b8 0
428
+ .b8 0
429
+ }
430
+ .section .debug_info
431
+ {
432
+ .b32 264
433
+ .b8 2
434
+ .b8 0
435
+ .b32 .debug_abbrev
436
+ .b8 8
437
+ .b8 1
438
+ .b8 116
439
+ .b8 114
440
+ .b8 105
441
+ .b8 116
442
+ .b8 111
443
+ .b8 110
444
+ .b8 0
445
+ .b8 2
446
+ .b8 0
447
+ .b8 99
448
+ .b8 54
449
+ .b8 105
450
+ .b8 107
451
+ .b8 53
452
+ .b8 118
453
+ .b8 120
454
+ .b8 55
455
+ .b8 112
456
+ .b8 50
457
+ .b8 50
458
+ .b8 102
459
+ .b8 112
460
+ .b8 107
461
+ .b8 52
462
+ .b8 100
463
+ .b8 99
464
+ .b8 118
465
+ .b8 104
466
+ .b8 53
467
+ .b8 53
468
+ .b8 122
469
+ .b8 105
470
+ .b8 109
471
+ .b8 119
472
+ .b8 52
473
+ .b8 116
474
+ .b8 53
475
+ .b8 110
476
+ .b8 114
477
+ .b8 53
478
+ .b8 122
479
+ .b8 110
480
+ .b8 50
481
+ .b8 98
482
+ .b8 55
483
+ .b8 105
484
+ .b8 110
485
+ .b8 117
486
+ .b8 106
487
+ .b8 120
488
+ .b8 106
489
+ .b8 97
490
+ .b8 117
491
+ .b8 120
492
+ .b8 115
493
+ .b8 104
494
+ .b8 108
495
+ .b8 106
496
+ .b8 117
497
+ .b8 109
498
+ .b8 109
499
+ .b8 46
500
+ .b8 112
501
+ .b8 121
502
+ .b8 0
503
+ .b32 .debug_line
504
+ .b8 47
505
+ .b8 116
506
+ .b8 109
507
+ .b8 112
508
+ .b8 47
509
+ .b8 116
510
+ .b8 111
511
+ .b8 114
512
+ .b8 99
513
+ .b8 104
514
+ .b8 105
515
+ .b8 110
516
+ .b8 100
517
+ .b8 117
518
+ .b8 99
519
+ .b8 116
520
+ .b8 111
521
+ .b8 114
522
+ .b8 95
523
+ .b8 114
524
+ .b8 111
525
+ .b8 111
526
+ .b8 116
527
+ .b8 47
528
+ .b8 54
529
+ .b8 105
530
+ .b8 0
531
+ .b8 1
532
+ .b64 $L__func_begin0
533
+ .b64 $L__func_end0
534
+ .b8 2
535
+ .b8 116
536
+ .b8 114
537
+ .b8 105
538
+ .b8 116
539
+ .b8 111
540
+ .b8 110
541
+ .b8 95
542
+ .b8 95
543
+ .b8 48
544
+ .b8 100
545
+ .b8 49
546
+ .b8 100
547
+ .b8 50
548
+ .b8 100
549
+ .b8 51
550
+ .b8 100
551
+ .b8 101
552
+ .b8 52
553
+ .b8 101
554
+ .b8 0
555
+ .b8 116
556
+ .b8 114
557
+ .b8 105
558
+ .b8 116
559
+ .b8 111
560
+ .b8 110
561
+ .b8 95
562
+ .b8 95
563
+ .b8 48
564
+ .b8 100
565
+ .b8 49
566
+ .b8 100
567
+ .b8 50
568
+ .b8 100
569
+ .b8 51
570
+ .b8 100
571
+ .b8 101
572
+ .b8 52
573
+ .b8 101
574
+ .b8 0
575
+ .b8 1
576
+ .b8 18
577
+ .b8 1
578
+ .b8 1
579
+ .b8 3
580
+ .b64 $L__func_begin0
581
+ .b64 $L__func_end0
582
+ .b8 1
583
+ .b8 156
584
+ .b32 125
585
+ .b8 4
586
+ .b32 125
587
+ .b64 $L__tmp1
588
+ .b64 $L__tmp24
589
+ .b8 2
590
+ .b8 35
591
+ .b8 25
592
+ .b8 5
593
+ .b32 125
594
+ .b64 $L__tmp1
595
+ .b64 $L__tmp24
596
+ .b8 2
597
+ .b8 243
598
+ .b8 36
599
+ .b8 0
600
+ .b8 5
601
+ .b32 125
602
+ .b64 $L__tmp2
603
+ .b64 $L__tmp25
604
+ .b8 2
605
+ .b8 35
606
+ .b8 25
607
+ .b8 0
608
+ .b8 0
609
+ }
610
+ .section .debug_pubnames
611
+ {
612
+ .b32 $L__pubNames_end0-$L__pubNames_start0
613
+ $L__pubNames_start0:
614
+ .b8 2
615
+ .b8 0
616
+ .b32 .debug_info
617
+ .b32 268
618
+ .b32 125
619
+ .b8 116
620
+ .b8 114
621
+ .b8 105
622
+ .b8 116
623
+ .b8 111
624
+ .b8 110
625
+ .b8 95
626
+ .b8 95
627
+ .b8 48
628
+ .b8 100
629
+ .b8 49
630
+ .b8 100
631
+ .b8 50
632
+ .b8 100
633
+ .b8 51
634
+ .b8 100
635
+ .b8 101
636
+ .b8 52
637
+ .b8 101
638
+ .b8 0
639
+ .b32 0
640
+ $L__pubNames_end0:
641
+ }
642
+ .section .debug_pubtypes
643
+ {
644
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
645
+ $L__pubTypes_start0:
646
+ .b8 2
647
+ .b8 0
648
+ .b32 .debug_info
649
+ .b32 268
650
+ .b32 0
651
+ $L__pubTypes_end0:
652
+ }
653
+ .section .debug_loc { }
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<32x1xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<32x1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<512> : tensor<32x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<32x1xi32, #blocked>
9
+ %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
10
+ %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
11
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1>
12
+ %cst_6 = arith.constant dense<true> : tensor<32x1xi1, #blocked>
13
+ %c32_i32 = arith.constant 32 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c32_i32 : i32
16
+ %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
17
+ %3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32x1xi32, #blocked1>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xi32, #blocked>
20
+ %6 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked1>
21
+ %7 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked>
22
+ %8 = arith.addi %6, %4 : tensor<32x1xi32, #blocked1>
23
+ %9 = arith.addi %7, %5 : tensor<32x1xi32, #blocked>
24
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
26
+ %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
27
+ %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
28
+ %14 = tt.broadcast %8 : (tensor<32x1xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
29
+ %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
30
+ %16 = arith.addi %14, %15 : tensor<32x128xi32, #blocked1>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>, #blocked1>
32
+ %18 = tt.addptr %17, %16 : tensor<32x128x!tt.ptr<f32, 1>, #blocked1>, tensor<32x128xi32, #blocked1>
33
+ %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<32x128xi1, #blocked1>
34
+ %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32, #blocked1>
35
+ %21 = arith.addf %20, %cst_5 : tensor<32x128xf32, #blocked1>
36
+ %22 = arith.select %19, %21, %cst_5 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1>
37
+ %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
38
+ ^bb0(%arg5: f32, %arg6: f32):
39
+ %40 = arith.addf %arg5, %arg6 : f32
40
+ tt.reduce.return %40 : f32
41
+ }) : (tensor<32x128xf32, #blocked1>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
42
+ %24 = triton_gpu.convert_layout %23 : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
43
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xf32, #blocked>
44
+ %26 = arith.divsi %9, %cst_2 : tensor<32x1xi32, #blocked>
45
+ %27 = arith.remsi %9, %cst_2 : tensor<32x1xi32, #blocked>
46
+ %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>, #blocked>
47
+ %29 = tt.addptr %28, %26 : tensor<32x1x!tt.ptr<i64, 1>, #blocked>, tensor<32x1xi32, #blocked>
48
+ %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64, #blocked>
49
+ %31 = arith.addi %30, %cst_1 : tensor<32x1xi64, #blocked>
50
+ %32 = arith.cmpi slt, %30, %cst_0 : tensor<32x1xi64, #blocked>
51
+ %33 = arith.select %32, %31, %30 : tensor<32x1xi1, #blocked>, tensor<32x1xi64, #blocked>
52
+ %34 = arith.muli %33, %cst : tensor<32x1xi64, #blocked>
53
+ %35 = arith.extsi %27 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked>
54
+ %36 = arith.addi %35, %34 : tensor<32x1xi64, #blocked>
55
+ %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>, #blocked>
56
+ %38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xi64, #blocked>
57
+ %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xf32, #blocked>, tensor<32x1xi1, #blocked>) -> tensor<32x1xf32, #blocked>
58
+ tt.return
59
+ }
60
+ }
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<32x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<32x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<32x1xi64>
6
+ %cst_2 = arith.constant dense<true> : tensor<32x1xi1>
7
+ %cst_3 = arith.constant dense<256> : tensor<32x1xi32>
8
+ %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
9
+ %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
10
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32>
11
+ %c32_i32 = arith.constant 32 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c32_i32 : i32
14
+ %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<32x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<32x1xi32>
18
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
20
+ %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
21
+ %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
22
+ %10 = tt.broadcast %5 : (tensor<32x1xi32>) -> tensor<32x128xi32>
23
+ %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<32x128xi32>
24
+ %12 = arith.addi %10, %11 : tensor<32x128xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>>
26
+ %14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr<f32, 1>>, tensor<32x128xi32>
27
+ %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<32x128xi1>
28
+ %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32>
29
+ %17 = arith.addf %16, %cst_6 : tensor<32x128xf32>
30
+ %18 = arith.select %15, %17, %cst_6 : tensor<32x128xi1>, tensor<32x128xf32>
31
+ %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
32
+ ^bb0(%arg5: f32, %arg6: f32):
33
+ %35 = arith.addf %arg5, %arg6 : f32
34
+ tt.reduce.return %35 : f32
35
+ }) : (tensor<32x128xf32>) -> tensor<32xf32>
36
+ %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<32xf32>) -> tensor<32x1xf32>
37
+ %21 = arith.divsi %5, %cst_3 : tensor<32x1xi32>
38
+ %22 = arith.remsi %5, %cst_3 : tensor<32x1xi32>
39
+ %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>>
40
+ %24 = tt.addptr %23, %21 : tensor<32x1x!tt.ptr<i64, 1>>, tensor<32x1xi32>
41
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64>
42
+ %26 = arith.addi %25, %cst_1 : tensor<32x1xi64>
43
+ %27 = arith.cmpi slt, %25, %cst_0 : tensor<32x1xi64>
44
+ %28 = arith.select %27, %26, %25 : tensor<32x1xi1>, tensor<32x1xi64>
45
+ %29 = arith.muli %28, %cst : tensor<32x1xi64>
46
+ %30 = arith.extsi %22 : tensor<32x1xi32> to tensor<32x1xi64>
47
+ %31 = arith.addi %30, %29 : tensor<32x1xi64>
48
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>>
49
+ %33 = tt.addptr %32, %31 : tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xi64>
50
+ %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xf32>, tensor<32x1xi1>) -> tensor<32x1xf32>
51
+ tt.return
52
+ }
53
+ }
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
10
+ %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
11
+ %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
12
+ %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
13
+ %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
14
+ %8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
15
+ %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
16
+ %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
17
+ %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
18
+ %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
19
+ %13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
20
+ %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
21
+ tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
22
+ tt.return
23
+ }
24
+ }
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin ADDED
Binary file (14.1 kB). View file
 
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
9
+ %c0_i32 = arith.constant 0 : i32
10
+ %c128_i32 = arith.constant 128 : i32
11
+ %c8_i32 = arith.constant 8 : i32
12
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
13
+ %c64_i32 = arith.constant 64 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c64_i32 : i32
16
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
17
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
20
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
21
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
22
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
23
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
24
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
26
+ %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
27
+ %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
28
+ %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
29
+ %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
30
+ %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
32
+ %18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
33
+ %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
34
+ %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
35
+ %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
36
+ %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
37
+ %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
38
+ %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
39
+ %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
40
+ %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
41
+ %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
42
+ %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
43
+ %34 = tt.load %32, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
44
+ %35 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
45
+ %36 = tt.load %35, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
46
+ %37 = arith.mulf %34, %36 : tensor<64x8xf32, #blocked>
47
+ %38 = arith.addf %arg6, %37 : tensor<64x8xf32, #blocked>
48
+ %39 = arith.select %33, %38, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
49
+ scf.yield %39 : tensor<64x8xf32, #blocked>
50
+ }
51
+ %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
52
+ ^bb0(%arg5: f32, %arg6: f32):
53
+ %25 = arith.addf %arg5, %arg6 : f32
54
+ tt.reduce.return %25 : f32
55
+ }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
56
+ %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
57
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
58
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
59
+ %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
60
+ tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
61
+ tt.return
62
+ }
63
+ }
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx ADDED
@@ -0,0 +1,577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4de(
13
+ .param .u64 triton__0d1d2d3de4de_param_0,
14
+ .param .u64 triton__0d1d2d3de4de_param_1,
15
+ .param .u64 triton__0d1d2d3de4de_param_2,
16
+ .param .u32 triton__0d1d2d3de4de_param_3,
17
+ .param .u32 triton__0d1d2d3de4de_param_4
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<20>;
22
+ .reg .b16 %rs<5>;
23
+ .reg .b32 %r<98>;
24
+ .reg .f32 %f<47>;
25
+ .reg .b64 %rd<10>;
26
+ .loc 1 18 0
27
+ $L__func_begin0:
28
+ .loc 1 18 0
29
+
30
+ ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
31
+ ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
32
+ ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
33
+ $L__tmp0:
34
+ .loc 1 22 44
35
+ mov.u32 %r1, %tid.x;
36
+ and.b32 %r2, %r1, 31;
37
+ shl.b32 %r13, %r1, 2;
38
+ and.b32 %r3, %r13, 60;
39
+ .loc 1 24 33
40
+ bfe.u32 %r4, %r1, 5, 2;
41
+ .loc 1 21 28
42
+ mov.u32 %r11, %ctaid.x;
43
+ .loc 1 21 33
44
+ shl.b32 %r5, %r11, 6;
45
+ .loc 1 22 23
46
+ or.b32 %r14, %r5, %r3;
47
+ .loc 1 26 20
48
+ shr.s32 %r16, %r14, 31;
49
+ shr.u32 %r17, %r16, 24;
50
+ add.s32 %r18, %r14, %r17;
51
+ shr.s32 %r19, %r18, 8;
52
+ .loc 1 29 36
53
+ mad.lo.s32 %r20, %r19, 32512, %r14;
54
+ shl.b32 %r21, %r4, 9;
55
+ add.s32 %r22, %r20, %r21;
56
+ shl.b32 %r23, %r1, 4;
57
+ and.b32 %r24, %r23, 256;
58
+ add.s32 %r96, %r22, %r24;
59
+ mov.f32 %f43, 0f00000000;
60
+ mov.b32 %r97, -8;
61
+ mov.pred %p1, -1;
62
+ mov.f32 %f44, %f43;
63
+ mov.f32 %f45, %f43;
64
+ mov.f32 %f46, %f43;
65
+ $L__BB0_1:
66
+ .loc 1 33 34
67
+ mul.wide.s32 %rd6, %r96, 2;
68
+ add.s64 %rd4, %rd1, %rd6;
69
+ mov.b32 %r27, 0;
70
+ .loc 1 33 63
71
+ mov.u32 %r25, 0x0;
72
+ mov.u32 %r26, 0x0;
73
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ];
74
+ @!%p1 mov.u32 %r25, %r27;
75
+ @!%p1 mov.u32 %r26, %r27;
76
+ cvt.u16.u32 %rs1, %r25;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; }
78
+ cvt.u16.u32 %rs3, %r26;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; }
80
+ .loc 1 33 115
81
+ cvt.f32.bf16 %r29, %rs1;
82
+ mov.b32 %f13, %r29;
83
+ cvt.f32.bf16 %r30, %rs2;
84
+ mov.b32 %f14, %r30;
85
+ cvt.f32.bf16 %r31, %rs3;
86
+ mov.b32 %f15, %r31;
87
+ cvt.f32.bf16 %r32, %rs4;
88
+ mov.b32 %f16, %r32;
89
+ .loc 1 34 34
90
+ mul.wide.s32 %rd7, %r96, 4;
91
+ add.s64 %rd5, %rd2, %rd7;
92
+ .loc 1 34 63
93
+ mov.u32 %r33, 0x0;
94
+ mov.u32 %r34, 0x0;
95
+ mov.u32 %r35, 0x0;
96
+ mov.u32 %r36, 0x0;
97
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
98
+ @!%p1 mov.u32 %r33, %r27;
99
+ @!%p1 mov.u32 %r34, %r27;
100
+ @!%p1 mov.u32 %r35, %r27;
101
+ @!%p1 mov.u32 %r36, %r27;
102
+ mov.b32 %f17, %r33;
103
+ mov.b32 %f18, %r34;
104
+ mov.b32 %f19, %r35;
105
+ mov.b32 %f20, %r36;
106
+ .loc 1 39 38
107
+ fma.rn.f32 %f46, %f16, %f20, %f46;
108
+ fma.rn.f32 %f45, %f15, %f19, %f45;
109
+ fma.rn.f32 %f44, %f14, %f18, %f44;
110
+ fma.rn.f32 %f43, %f13, %f17, %f43;
111
+ .loc 1 29 36
112
+ add.s32 %r97, %r97, 8;
113
+ add.s32 %r96, %r96, 2048;
114
+ setp.lt.u32 %p9, %r97, 120;
115
+ @%p9 bra $L__BB0_1;
116
+ .loc 1 22 44
117
+ and.b32 %r58, %r1, 63;
118
+ .loc 1 22 23
119
+ or.b32 %r59, %r5, %r58;
120
+ $L__tmp1:
121
+ .loc 2 243 36
122
+ mov.b32 %r60, %f43;
123
+ shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
124
+ mov.b32 %f21, %r61;
125
+ $L__tmp2:
126
+ .loc 2 233 15
127
+ add.f32 %f22, %f43, %f21;
128
+ $L__tmp3:
129
+ .loc 2 243 36
130
+ mov.b32 %r62, %f44;
131
+ shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
132
+ mov.b32 %f23, %r63;
133
+ $L__tmp4:
134
+ .loc 2 233 15
135
+ add.f32 %f24, %f44, %f23;
136
+ $L__tmp5:
137
+ .loc 2 243 36
138
+ mov.b32 %r64, %f45;
139
+ shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
140
+ mov.b32 %f25, %r65;
141
+ $L__tmp6:
142
+ .loc 2 233 15
143
+ add.f32 %f26, %f45, %f25;
144
+ $L__tmp7:
145
+ .loc 2 243 36
146
+ mov.b32 %r66, %f46;
147
+ shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
148
+ mov.b32 %f27, %r67;
149
+ $L__tmp8:
150
+ .loc 2 233 15
151
+ add.f32 %f28, %f46, %f27;
152
+ $L__tmp9:
153
+ .loc 2 243 36
154
+ setp.lt.u32 %p10, %r2, 16;
155
+ shl.b32 %r68, %r3, 2;
156
+ or.b32 %r69, %r68, %r4;
157
+ shl.b32 %r70, %r69, 2;
158
+ mov.u32 %r71, global_smem;
159
+ add.s32 %r41, %r71, %r70;
160
+ mov.b32 %r42, %f22;
161
+ @%p10 st.shared.b32 [ %r41 + 0 ], %r42;
162
+ shl.b32 %r72, %r4, 2;
163
+ shl.b32 %r73, %r3, 4;
164
+ or.b32 %r74, %r73, 16;
165
+ or.b32 %r75, %r74, %r72;
166
+ add.s32 %r43, %r71, %r75;
167
+ mov.b32 %r44, %f24;
168
+ @%p10 st.shared.b32 [ %r43 + 0 ], %r44;
169
+ or.b32 %r76, %r73, 32;
170
+ or.b32 %r77, %r76, %r72;
171
+ add.s32 %r45, %r71, %r77;
172
+ mov.b32 %r46, %f26;
173
+ @%p10 st.shared.b32 [ %r45 + 0 ], %r46;
174
+ or.b32 %r78, %r73, 48;
175
+ or.b32 %r79, %r78, %r72;
176
+ add.s32 %r47, %r71, %r79;
177
+ mov.b32 %r48, %f28;
178
+ @%p10 st.shared.b32 [ %r47 + 0 ], %r48;
179
+ bar.sync 0;
180
+ setp.lt.s32 %p14, %r1, 256;
181
+ add.s32 %r50, %r71, %r13;
182
+ @%p14 ld.shared.b32 %r49, [ %r50 + 0 ];
183
+ mov.b32 %f29, %r49;
184
+ shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
185
+ mov.b32 %f30, %r81;
186
+ $L__tmp10:
187
+ .loc 2 233 15
188
+ add.f32 %f31, %f29, %f30;
189
+ $L__tmp11:
190
+ .loc 2 243 36
191
+ mov.b32 %r82, %f31;
192
+ shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
193
+ mov.b32 %f32, %r83;
194
+ $L__tmp12:
195
+ .loc 2 233 15
196
+ add.f32 %f33, %f31, %f32;
197
+ $L__tmp13:
198
+ .loc 2 243 36
199
+ and.b32 %r84, %r1, 3;
200
+ setp.eq.s32 %p19, %r84, 0;
201
+ and.pred %p15, %p14, %p19;
202
+ mov.b32 %r52, %f33;
203
+ @%p15 st.shared.b32 [ %r50 + 0 ], %r52;
204
+ add.s32 %r54, %r50, 512;
205
+ @%p14 ld.shared.b32 %r53, [ %r54 + 0 ];
206
+ mov.b32 %f34, %r53;
207
+ shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
208
+ mov.b32 %f35, %r85;
209
+ $L__tmp14:
210
+ .loc 2 233 15
211
+ add.f32 %f36, %f34, %f35;
212
+ $L__tmp15:
213
+ .loc 2 243 36
214
+ mov.b32 %r86, %f36;
215
+ shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
216
+ mov.b32 %f37, %r87;
217
+ $L__tmp16:
218
+ .loc 2 233 15
219
+ add.f32 %f38, %f36, %f37;
220
+ $L__tmp17:
221
+ .loc 2 243 36
222
+ mov.b32 %r56, %f38;
223
+ @%p15 st.shared.b32 [ %r54 + 0 ], %r56;
224
+ bar.sync 0;
225
+ add.s32 %r88, %r71, %r73;
226
+ ld.shared.f32 %f39, [%r88];
227
+ add.s32 %r89, %r71, %r74;
228
+ ld.shared.f32 %f40, [%r89];
229
+ add.s32 %r90, %r71, %r76;
230
+ ld.shared.f32 %f41, [%r90];
231
+ add.s32 %r91, %r71, %r78;
232
+ ld.shared.f32 %f42, [%r91];
233
+ $L__tmp18:
234
+ .loc 1 40 28
235
+ bar.sync 0;
236
+ add.s32 %r92, %r71, %r68;
237
+ st.shared.f32 [%r92], %f39;
238
+ st.shared.f32 [%r92+4], %f40;
239
+ st.shared.f32 [%r92+8], %f41;
240
+ st.shared.f32 [%r92+12], %f42;
241
+ bar.sync 0;
242
+ shl.b32 %r93, %r58, 2;
243
+ add.s32 %r94, %r71, %r93;
244
+ ld.shared.u32 %r57, [%r94];
245
+ .loc 1 41 25
246
+ mul.wide.s32 %rd9, %r59, 4;
247
+ add.s64 %rd8, %rd3, %rd9;
248
+ .loc 1 41 36
249
+ and.b32 %r95, %r1, 64;
250
+ setp.eq.s32 %p18, %r95, 0;
251
+ @%p18 st.global.b32 [ %rd8 + 0 ], { %r57 };
252
+ .loc 1 41 4
253
+ ret;
254
+ $L__tmp19:
255
+ $L__func_end0:
256
+
257
+ }
258
+ .file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py"
259
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
260
+ .section .debug_abbrev
261
+ {
262
+ .b8 1
263
+ .b8 17
264
+ .b8 1
265
+ .b8 37
266
+ .b8 8
267
+ .b8 19
268
+ .b8 5
269
+ .b8 3
270
+ .b8 8
271
+ .b8 16
272
+ .b8 6
273
+ .b8 27
274
+ .b8 8
275
+ .b8 180
276
+ .b8 66
277
+ .b8 12
278
+ .b8 17
279
+ .b8 1
280
+ .b8 18
281
+ .b8 1
282
+ .b8 0
283
+ .b8 0
284
+ .b8 2
285
+ .b8 46
286
+ .b8 0
287
+ .b8 135
288
+ .b8 64
289
+ .b8 8
290
+ .b8 3
291
+ .b8 8
292
+ .b8 58
293
+ .b8 11
294
+ .b8 59
295
+ .b8 11
296
+ .b8 63
297
+ .b8 12
298
+ .b8 32
299
+ .b8 11
300
+ .b8 0
301
+ .b8 0
302
+ .b8 3
303
+ .b8 46
304
+ .b8 1
305
+ .b8 17
306
+ .b8 1
307
+ .b8 18
308
+ .b8 1
309
+ .b8 64
310
+ .b8 10
311
+ .b8 49
312
+ .b8 19
313
+ .b8 0
314
+ .b8 0
315
+ .b8 4
316
+ .b8 29
317
+ .b8 0
318
+ .b8 49
319
+ .b8 19
320
+ .b8 17
321
+ .b8 1
322
+ .b8 18
323
+ .b8 1
324
+ .b8 88
325
+ .b8 11
326
+ .b8 89
327
+ .b8 11
328
+ .b8 87
329
+ .b8 11
330
+ .b8 0
331
+ .b8 0
332
+ .b8 5
333
+ .b8 29
334
+ .b8 1
335
+ .b8 49
336
+ .b8 19
337
+ .b8 17
338
+ .b8 1
339
+ .b8 18
340
+ .b8 1
341
+ .b8 88
342
+ .b8 11
343
+ .b8 89
344
+ .b8 11
345
+ .b8 87
346
+ .b8 11
347
+ .b8 0
348
+ .b8 0
349
+ .b8 0
350
+ }
351
+ .section .debug_info
352
+ {
353
+ .b32 266
354
+ .b8 2
355
+ .b8 0
356
+ .b32 .debug_abbrev
357
+ .b8 8
358
+ .b8 1
359
+ .b8 116
360
+ .b8 114
361
+ .b8 105
362
+ .b8 116
363
+ .b8 111
364
+ .b8 110
365
+ .b8 0
366
+ .b8 2
367
+ .b8 0
368
+ .b8 99
369
+ .b8 115
370
+ .b8 106
371
+ .b8 100
372
+ .b8 55
373
+ .b8 109
374
+ .b8 108
375
+ .b8 114
376
+ .b8 106
377
+ .b8 117
378
+ .b8 106
379
+ .b8 100
380
+ .b8 52
381
+ .b8 117
382
+ .b8 119
383
+ .b8 122
384
+ .b8 101
385
+ .b8 53
386
+ .b8 116
387
+ .b8 107
388
+ .b8 103
389
+ .b8 55
390
+ .b8 112
391
+ .b8 116
392
+ .b8 116
393
+ .b8 101
394
+ .b8 97
395
+ .b8 103
396
+ .b8 112
397
+ .b8 105
398
+ .b8 104
399
+ .b8 103
400
+ .b8 116
401
+ .b8 53
402
+ .b8 122
403
+ .b8 116
404
+ .b8 97
405
+ .b8 116
406
+ .b8 102
407
+ .b8 113
408
+ .b8 99
409
+ .b8 104
410
+ .b8 112
411
+ .b8 114
412
+ .b8 99
413
+ .b8 114
414
+ .b8 97
415
+ .b8 120
416
+ .b8 50
417
+ .b8 50
418
+ .b8 108
419
+ .b8 115
420
+ .b8 46
421
+ .b8 112
422
+ .b8 121
423
+ .b8 0
424
+ .b32 .debug_line
425
+ .b8 47
426
+ .b8 116
427
+ .b8 109
428
+ .b8 112
429
+ .b8 47
430
+ .b8 116
431
+ .b8 111
432
+ .b8 114
433
+ .b8 99
434
+ .b8 104
435
+ .b8 105
436
+ .b8 110
437
+ .b8 100
438
+ .b8 117
439
+ .b8 99
440
+ .b8 116
441
+ .b8 111
442
+ .b8 114
443
+ .b8 95
444
+ .b8 114
445
+ .b8 111
446
+ .b8 111
447
+ .b8 116
448
+ .b8 47
449
+ .b8 115
450
+ .b8 106
451
+ .b8 0
452
+ .b8 1
453
+ .b64 $L__func_begin0
454
+ .b64 $L__func_end0
455
+ .b8 2
456
+ .b8 116
457
+ .b8 114
458
+ .b8 105
459
+ .b8 116
460
+ .b8 111
461
+ .b8 110
462
+ .b8 95
463
+ .b8 95
464
+ .b8 48
465
+ .b8 100
466
+ .b8 49
467
+ .b8 100
468
+ .b8 50
469
+ .b8 100
470
+ .b8 51
471
+ .b8 100
472
+ .b8 101
473
+ .b8 52
474
+ .b8 100
475
+ .b8 101
476
+ .b8 0
477
+ .b8 116
478
+ .b8 114
479
+ .b8 105
480
+ .b8 116
481
+ .b8 111
482
+ .b8 110
483
+ .b8 95
484
+ .b8 95
485
+ .b8 48
486
+ .b8 100
487
+ .b8 49
488
+ .b8 100
489
+ .b8 50
490
+ .b8 100
491
+ .b8 51
492
+ .b8 100
493
+ .b8 101
494
+ .b8 52
495
+ .b8 100
496
+ .b8 101
497
+ .b8 0
498
+ .b8 1
499
+ .b8 18
500
+ .b8 1
501
+ .b8 1
502
+ .b8 3
503
+ .b64 $L__func_begin0
504
+ .b64 $L__func_end0
505
+ .b8 1
506
+ .b8 156
507
+ .b32 125
508
+ .b8 4
509
+ .b32 125
510
+ .b64 $L__tmp1
511
+ .b64 $L__tmp18
512
+ .b8 2
513
+ .b8 40
514
+ .b8 25
515
+ .b8 5
516
+ .b32 125
517
+ .b64 $L__tmp2
518
+ .b64 $L__tmp17
519
+ .b8 2
520
+ .b8 40
521
+ .b8 25
522
+ .b8 4
523
+ .b32 125
524
+ .b64 $L__tmp2
525
+ .b64 $L__tmp17
526
+ .b8 2
527
+ .b8 243
528
+ .b8 36
529
+ .b8 0
530
+ .b8 0
531
+ .b8 0
532
+ }
533
+ .section .debug_pubnames
534
+ {
535
+ .b32 $L__pubNames_end0-$L__pubNames_start0
536
+ $L__pubNames_start0:
537
+ .b8 2
538
+ .b8 0
539
+ .b32 .debug_info
540
+ .b32 270
541
+ .b32 125
542
+ .b8 116
543
+ .b8 114
544
+ .b8 105
545
+ .b8 116
546
+ .b8 111
547
+ .b8 110
548
+ .b8 95
549
+ .b8 95
550
+ .b8 48
551
+ .b8 100
552
+ .b8 49
553
+ .b8 100
554
+ .b8 50
555
+ .b8 100
556
+ .b8 51
557
+ .b8 100
558
+ .b8 101
559
+ .b8 52
560
+ .b8 100
561
+ .b8 101
562
+ .b8 0
563
+ .b32 0
564
+ $L__pubNames_end0:
565
+ }
566
+ .section .debug_pubtypes
567
+ {
568
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
569
+ $L__pubTypes_start0:
570
+ .b8 2
571
+ .b8 0
572
+ .b32 .debug_info
573
+ .b32 270
574
+ .b32 0
575
+ $L__pubTypes_end0:
576
+ }
577
+ .section .debug_loc { }
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
9
+ %c0_i32 = arith.constant 0 : i32
10
+ %c128_i32 = arith.constant 128 : i32
11
+ %c8_i32 = arith.constant 8 : i32
12
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
14
+ %c64_i32 = arith.constant 64 : i32
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = arith.muli %0, %c64_i32 : i32
17
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
19
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
20
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
21
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
22
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
23
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
24
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
25
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
26
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
27
+ %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
28
+ %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
29
+ %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
30
+ %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
31
+ %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
32
+ %17 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
33
+ %18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
34
+ %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
35
+ %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
36
+ %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
37
+ %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
38
+ %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
39
+ %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
40
+ %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
41
+ %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
42
+ %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
43
+ %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
44
+ %34 = tt.load %32, %33, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
45
+ %35 = arith.extf %34 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
46
+ %36 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
47
+ %37 = tt.load %36, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
48
+ %38 = arith.mulf %35, %37 : tensor<64x8xf32, #blocked>
49
+ %39 = arith.addf %arg6, %38 : tensor<64x8xf32, #blocked>
50
+ %40 = arith.select %33, %39, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
51
+ scf.yield %40 : tensor<64x8xf32, #blocked>
52
+ }
53
+ %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
54
+ ^bb0(%arg5: f32, %arg6: f32):
55
+ %25 = arith.addf %arg5, %arg6 : f32
56
+ tt.reduce.return %25 : f32
57
+ }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
58
+ %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
59
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
60
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
61
+ %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
62
+ tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
63
+ tt.return
64
+ }
65
+ }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin ADDED
Binary file (4.52 kB). View file
 
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6,
20
+ .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7,
21
+ .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8
22
+ )
23
+ .maxntid 64, 1, 1
24
+ {
25
+ .reg .pred %p<37>;
26
+ .reg .b16 %rs<9>;
27
+ .reg .b32 %r<110>;
28
+ .reg .f32 %f<86>;
29
+ .reg .b64 %rd<26>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0];
35
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1];
36
+ $L__tmp0:
37
+ .loc 1 26 26
38
+ mov.u32 %r76, %tid.x;
39
+ and.b32 %r77, %r76, 31;
40
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2];
41
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3];
42
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4];
43
+ shl.b32 %r78, %r76, 2;
44
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5];
45
+ and.b32 %r79, %r78, 252;
46
+ ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6];
47
+ .loc 1 23 28
48
+ mov.u32 %r1, %ctaid.x;
49
+ .loc 1 30 40
50
+ shl.b32 %r80, %r1, 8;
51
+ .loc 1 30 36
52
+ or.b32 %r81, %r80, %r79;
53
+ .loc 1 30 30
54
+ mul.wide.s32 %rd22, %r81, 2;
55
+ add.s64 %rd1, %rd16, %rd22;
56
+ mov.b32 %r4, 0;
57
+ mov.pred %p1, -1;
58
+ .loc 1 30 46
59
+ mov.u32 %r2, 0x0;
60
+ mov.u32 %r3, 0x0;
61
+ @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
62
+ @!%p1 mov.u32 %r2, %r4;
63
+ @!%p1 mov.u32 %r3, %r4;
64
+ cvt.u16.u32 %rs1, %r2;
65
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
66
+ cvt.u16.u32 %rs3, %r3;
67
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
68
+ .loc 1 30 67
69
+ cvt.f32.bf16 %r6, %rs1;
70
+ mov.b32 %f1, %r6;
71
+ cvt.f32.bf16 %r7, %rs2;
72
+ mov.b32 %f2, %r7;
73
+ cvt.f32.bf16 %r8, %rs3;
74
+ mov.b32 %f3, %r8;
75
+ cvt.f32.bf16 %r9, %rs4;
76
+ mov.b32 %f4, %r9;
77
+ .loc 1 31 30
78
+ mul.wide.u32 %rd23, %r79, 4;
79
+ add.s64 %rd2, %rd17, %rd23;
80
+ .loc 1 31 35
81
+ mov.u32 %r10, 0x0;
82
+ mov.u32 %r11, 0x0;
83
+ mov.u32 %r12, 0x0;
84
+ mov.u32 %r13, 0x0;
85
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
86
+ @!%p1 mov.u32 %r10, %r4;
87
+ @!%p1 mov.u32 %r11, %r4;
88
+ @!%p1 mov.u32 %r12, %r4;
89
+ @!%p1 mov.u32 %r13, %r4;
90
+ mov.b32 %f5, %r10;
91
+ mov.b32 %f6, %r11;
92
+ mov.b32 %f7, %r12;
93
+ mov.b32 %f8, %r13;
94
+ .loc 1 32 30
95
+ mul.wide.s32 %rd24, %r81, 4;
96
+ add.s64 %rd3, %rd18, %rd24;
97
+ .loc 1 32 46
98
+ mov.u32 %r18, 0x0;
99
+ mov.u32 %r19, 0x0;
100
+ mov.u32 %r20, 0x0;
101
+ mov.u32 %r21, 0x0;
102
+ @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
103
+ @!%p1 mov.u32 %r18, %r4;
104
+ @!%p1 mov.u32 %r19, %r4;
105
+ @!%p1 mov.u32 %r20, %r4;
106
+ @!%p1 mov.u32 %r21, %r4;
107
+ mov.b32 %f9, %r18;
108
+ mov.b32 %f10, %r19;
109
+ mov.b32 %f11, %r20;
110
+ mov.b32 %f12, %r21;
111
+ .loc 1 33 30
112
+ mul.wide.s32 %rd25, %r1, 4;
113
+ add.s64 %rd4, %rd19, %rd25;
114
+ .loc 1 33 35
115
+ mov.u32 %r26, 0x0;
116
+ @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
117
+ mov.b32 %f13, %r26;
118
+ mov.u32 %r27, 0x0;
119
+ @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
120
+ mov.u32 %r28, 0x0;
121
+ @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
122
+ mov.u32 %r29, 0x0;
123
+ @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
124
+ .loc 1 34 31
125
+ add.s64 %rd8, %rd20, %rd25;
126
+ .loc 1 34 36
127
+ mov.u32 %r55, 0x0;
128
+ @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
129
+ mov.b32 %f14, %r55;
130
+ mov.u32 %r31, 0x0;
131
+ @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
132
+ mov.u32 %r32, 0x0;
133
+ @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
134
+ mov.u32 %r33, 0x0;
135
+ @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
136
+ .loc 1 35 35
137
+ add.s64 %rd12, %rd15, %rd24;
138
+ .loc 1 35 51
139
+ mov.u32 %r34, 0x0;
140
+ mov.u32 %r35, 0x0;
141
+ mov.u32 %r36, 0x0;
142
+ mov.u32 %r37, 0x0;
143
+ @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ];
144
+ @!%p1 mov.u32 %r34, %r4;
145
+ @!%p1 mov.u32 %r35, %r4;
146
+ @!%p1 mov.u32 %r36, %r4;
147
+ @!%p1 mov.u32 %r37, %r4;
148
+ mov.b32 %f15, %r34;
149
+ mov.b32 %f16, %r35;
150
+ mov.b32 %f17, %r36;
151
+ mov.b32 %f18, %r37;
152
+ .loc 1 37 18
153
+ mul.f32 %f19, %f1, %f5;
154
+ mul.f32 %f20, %f2, %f6;
155
+ mul.f32 %f21, %f3, %f7;
156
+ mul.f32 %f22, %f4, %f8;
157
+ $L__tmp1:
158
+ .loc 2 233 15
159
+ fma.rn.f32 %f23, %f1, %f5, %f20;
160
+ fma.rn.f32 %f24, %f3, %f7, %f23;
161
+ fma.rn.f32 %f25, %f4, %f8, %f24;
162
+ $L__tmp2:
163
+ .loc 2 243 36
164
+ mov.b32 %r82, %f25;
165
+ shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1;
166
+ mov.b32 %f26, %r83;
167
+ $L__tmp3:
168
+ .loc 2 233 15
169
+ add.f32 %f27, %f25, %f26;
170
+ $L__tmp4:
171
+ .loc 2 243 36
172
+ mov.b32 %r84, %f27;
173
+ shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1;
174
+ mov.b32 %f28, %r85;
175
+ $L__tmp5:
176
+ .loc 2 233 15
177
+ add.f32 %f29, %f27, %f28;
178
+ $L__tmp6:
179
+ .loc 2 243 36
180
+ mov.b32 %r86, %f29;
181
+ shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1;
182
+ mov.b32 %f30, %r87;
183
+ $L__tmp7:
184
+ .loc 2 233 15
185
+ add.f32 %f31, %f29, %f30;
186
+ $L__tmp8:
187
+ .loc 2 243 36
188
+ mov.b32 %r88, %f31;
189
+ shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1;
190
+ mov.b32 %f32, %r89;
191
+ $L__tmp9:
192
+ .loc 2 233 15
193
+ add.f32 %f33, %f31, %f32;
194
+ $L__tmp10:
195
+ .loc 2 243 36
196
+ mov.b32 %r90, %f33;
197
+ shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1;
198
+ mov.b32 %f34, %r91;
199
+ $L__tmp11:
200
+ .loc 2 233 15
201
+ add.f32 %f35, %f33, %f34;
202
+ $L__tmp12:
203
+ .loc 2 243 36
204
+ setp.eq.s32 %p27, %r77, 0;
205
+ shr.u32 %r92, %r76, 3;
206
+ and.b32 %r93, %r92, 4;
207
+ mov.u32 %r94, global_smem;
208
+ add.s32 %r42, %r94, %r93;
209
+ mov.b32 %r43, %f35;
210
+ @%p27 st.shared.b32 [ %r42 + 0 ], %r43;
211
+ bar.sync 0;
212
+ setp.lt.s32 %p28, %r76, 2;
213
+ add.s32 %r45, %r94, %r78;
214
+ @%p28 ld.shared.b32 %r44, [ %r45 + 0 ];
215
+ mov.b32 %f36, %r44;
216
+ shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1;
217
+ mov.b32 %f37, %r95;
218
+ $L__tmp13:
219
+ .loc 2 233 15
220
+ add.f32 %f38, %f36, %f37;
221
+ $L__tmp14:
222
+ .loc 2 243 36
223
+ and.b32 %r96, %r76, 1;
224
+ setp.eq.b32 %p35, %r96, 1;
225
+ not.pred %p36, %p35;
226
+ and.pred %p29, %p28, %p36;
227
+ mov.b32 %r47, %f38;
228
+ @%p29 st.shared.b32 [ %r45 + 0 ], %r47;
229
+ bar.sync 0;
230
+ ld.shared.f32 %f39, [global_smem];
231
+ $L__tmp15:
232
+ .loc 3 8 15
233
+ add.f32 %f40, %f39, 0f00000000;
234
+ $L__tmp16:
235
+ .loc 1 41 19
236
+ sub.f32 %f41, %f9, %f13;
237
+ sub.f32 %f42, %f10, %f13;
238
+ sub.f32 %f43, %f11, %f13;
239
+ sub.f32 %f44, %f12, %f13;
240
+ .loc 1 42 20
241
+ mul.f32 %f45, %f41, %f14;
242
+ mul.f32 %f46, %f42, %f14;
243
+ mul.f32 %f47, %f43, %f14;
244
+ mul.f32 %f48, %f44, %f14;
245
+ .loc 1 43 19
246
+ mul.f32 %f49, %f20, %f46;
247
+ $L__tmp17:
248
+ .loc 2 243 36
249
+ bar.sync 0;
250
+ $L__tmp18:
251
+ .loc 2 233 15
252
+ fma.rn.f32 %f50, %f19, %f45, %f49;
253
+ fma.rn.f32 %f51, %f21, %f47, %f50;
254
+ fma.rn.f32 %f52, %f22, %f48, %f51;
255
+ $L__tmp19:
256
+ .loc 2 243 36
257
+ mov.b32 %r97, %f52;
258
+ shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
259
+ mov.b32 %f53, %r98;
260
+ $L__tmp20:
261
+ .loc 2 233 15
262
+ add.f32 %f54, %f52, %f53;
263
+ $L__tmp21:
264
+ .loc 2 243 36
265
+ mov.b32 %r99, %f54;
266
+ shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
267
+ mov.b32 %f55, %r100;
268
+ $L__tmp22:
269
+ .loc 2 233 15
270
+ add.f32 %f56, %f54, %f55;
271
+ $L__tmp23:
272
+ .loc 2 243 36
273
+ mov.b32 %r101, %f56;
274
+ shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
275
+ mov.b32 %f57, %r102;
276
+ $L__tmp24:
277
+ .loc 2 233 15
278
+ add.f32 %f58, %f56, %f57;
279
+ $L__tmp25:
280
+ .loc 2 243 36
281
+ mov.b32 %r103, %f58;
282
+ shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
283
+ mov.b32 %f59, %r104;
284
+ $L__tmp26:
285
+ .loc 2 233 15
286
+ add.f32 %f60, %f58, %f59;
287
+ $L__tmp27:
288
+ .loc 2 243 36
289
+ mov.b32 %r105, %f60;
290
+ shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
291
+ mov.b32 %f61, %r106;
292
+ $L__tmp28:
293
+ .loc 2 233 15
294
+ add.f32 %f62, %f60, %f61;
295
+ $L__tmp29:
296
+ .loc 2 243 36
297
+ mov.b32 %r49, %f62;
298
+ @%p27 st.shared.b32 [ %r42 + 0 ], %r49;
299
+ bar.sync 0;
300
+ @%p28 ld.shared.b32 %r50, [ %r45 + 0 ];
301
+ mov.b32 %f63, %r50;
302
+ shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1;
303
+ mov.b32 %f64, %r107;
304
+ $L__tmp30:
305
+ .loc 2 233 15
306
+ add.f32 %f65, %f63, %f64;
307
+ $L__tmp31:
308
+ .loc 2 243 36
309
+ mov.b32 %r53, %f65;
310
+ @%p29 st.shared.b32 [ %r45 + 0 ], %r53;
311
+ bar.sync 0;
312
+ ld.shared.f32 %f66, [global_smem];
313
+ $L__tmp32:
314
+ .loc 3 8 15
315
+ add.f32 %f67, %f66, 0f00000000;
316
+ mov.b32 %r56, 1132462080;
317
+ $L__tmp33:
318
+ .loc 1 48 20
319
+ div.full.f32 %r54, %r55, %r56;
320
+ mov.b32 %f68, %r54;
321
+ .loc 1 50 20
322
+ neg.f32 %f69, %f40;
323
+ fma.rn.f32 %f70, %f19, 0f43800000, %f69;
324
+ fma.rn.f32 %f71, %f20, 0f43800000, %f69;
325
+ fma.rn.f32 %f72, %f21, 0f43800000, %f69;
326
+ fma.rn.f32 %f73, %f22, 0f43800000, %f69;
327
+ .loc 1 52 20
328
+ neg.f32 %f74, %f45;
329
+ fma.rn.f32 %f75, %f74, %f67, %f70;
330
+ neg.f32 %f76, %f46;
331
+ fma.rn.f32 %f77, %f76, %f67, %f71;
332
+ neg.f32 %f78, %f47;
333
+ fma.rn.f32 %f79, %f78, %f67, %f72;
334
+ neg.f32 %f80, %f48;
335
+ fma.rn.f32 %f81, %f80, %f67, %f73;
336
+ .loc 1 54 20
337
+ fma.rn.f32 %f82, %f68, %f75, %f15;
338
+ fma.rn.f32 %f83, %f68, %f77, %f16;
339
+ fma.rn.f32 %f84, %f68, %f79, %f17;
340
+ fma.rn.f32 %f85, %f68, %f81, %f18;
341
+ .loc 1 56 51
342
+ mov.b32 %r66, %f82;
343
+ mov.b32 %r67, %f83;
344
+ mov.b32 %r68, %f84;
345
+ mov.b32 %r69, %f85;
346
+ @%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 };
347
+ .loc 1 57 25
348
+ add.s64 %rd14, %rd21, %rd22;
349
+ .loc 1 57 48
350
+ cvt.rn.bf16.f32 %rs5, %r66;
351
+ cvt.rn.bf16.f32 %rs6, %r67;
352
+ cvt.rn.bf16.f32 %rs7, %r68;
353
+ cvt.rn.bf16.f32 %rs8, %r69;
354
+ mov.b32 %r108, {%rs5, %rs6};
355
+ mov.b32 %r109, {%rs7, %rs8};
356
+ @%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 };
357
+ .loc 1 57 4
358
+ ret;
359
+ $L__tmp34:
360
+ $L__func_end0:
361
+
362
+ }
363
+ .file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py"
364
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
365
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
366
+ .section .debug_abbrev
367
+ {
368
+ .b8 1
369
+ .b8 17
370
+ .b8 1
371
+ .b8 37
372
+ .b8 8
373
+ .b8 19
374
+ .b8 5
375
+ .b8 3
376
+ .b8 8
377
+ .b8 16
378
+ .b8 6
379
+ .b8 27
380
+ .b8 8
381
+ .b8 180
382
+ .b8 66
383
+ .b8 12
384
+ .b8 17
385
+ .b8 1
386
+ .b8 18
387
+ .b8 1
388
+ .b8 0
389
+ .b8 0
390
+ .b8 2
391
+ .b8 46
392
+ .b8 0
393
+ .b8 135
394
+ .b8 64
395
+ .b8 8
396
+ .b8 3
397
+ .b8 8
398
+ .b8 58
399
+ .b8 11
400
+ .b8 59
401
+ .b8 11
402
+ .b8 63
403
+ .b8 12
404
+ .b8 32
405
+ .b8 11
406
+ .b8 0
407
+ .b8 0
408
+ .b8 3
409
+ .b8 46
410
+ .b8 1
411
+ .b8 17
412
+ .b8 1
413
+ .b8 18
414
+ .b8 1
415
+ .b8 64
416
+ .b8 10
417
+ .b8 49
418
+ .b8 19
419
+ .b8 0
420
+ .b8 0
421
+ .b8 4
422
+ .b8 29
423
+ .b8 1
424
+ .b8 49
425
+ .b8 19
426
+ .b8 17
427
+ .b8 1
428
+ .b8 18
429
+ .b8 1
430
+ .b8 88
431
+ .b8 11
432
+ .b8 89
433
+ .b8 11
434
+ .b8 87
435
+ .b8 11
436
+ .b8 0
437
+ .b8 0
438
+ .b8 5
439
+ .b8 29
440
+ .b8 0
441
+ .b8 49
442
+ .b8 19
443
+ .b8 17
444
+ .b8 1
445
+ .b8 18
446
+ .b8 1
447
+ .b8 88
448
+ .b8 11
449
+ .b8 89
450
+ .b8 11
451
+ .b8 87
452
+ .b8 11
453
+ .b8 0
454
+ .b8 0
455
+ .b8 0
456
+ }
457
+ .section .debug_info
458
+ {
459
+ .b32 403
460
+ .b8 2
461
+ .b8 0
462
+ .b32 .debug_abbrev
463
+ .b8 8
464
+ .b8 1
465
+ .b8 116
466
+ .b8 114
467
+ .b8 105
468
+ .b8 116
469
+ .b8 111
470
+ .b8 110
471
+ .b8 0
472
+ .b8 2
473
+ .b8 0
474
+ .b8 99
475
+ .b8 115
476
+ .b8 110
477
+ .b8 101
478
+ .b8 100
479
+ .b8 52
480
+ .b8 104
481
+ .b8 121
482
+ .b8 120
483
+ .b8 112
484
+ .b8 103
485
+ .b8 119
486
+ .b8 117
487
+ .b8 53
488
+ .b8 116
489
+ .b8 116
490
+ .b8 117
491
+ .b8 98
492
+ .b8 115
493
+ .b8 51
494
+ .b8 114
495
+ .b8 55
496
+ .b8 117
497
+ .b8 120
498
+ .b8 107
499
+ .b8 106
500
+ .b8 113
501
+ .b8 53
502
+ .b8 121
503
+ .b8 102
504
+ .b8 108
505
+ .b8 51
506
+ .b8 122
507
+ .b8 104
508
+ .b8 54
509
+ .b8 99
510
+ .b8 50
511
+ .b8 115
512
+ .b8 111
513
+ .b8 122
514
+ .b8 111
515
+ .b8 98
516
+ .b8 116
517
+ .b8 107
518
+ .b8 101
519
+ .b8 107
520
+ .b8 50
521
+ .b8 117
522
+ .b8 122
523
+ .b8 102
524
+ .b8 99
525
+ .b8 118
526
+ .b8 46
527
+ .b8 112
528
+ .b8 121
529
+ .b8 0
530
+ .b32 .debug_line
531
+ .b8 47
532
+ .b8 116
533
+ .b8 109
534
+ .b8 112
535
+ .b8 47
536
+ .b8 116
537
+ .b8 111
538
+ .b8 114
539
+ .b8 99
540
+ .b8 104
541
+ .b8 105
542
+ .b8 110
543
+ .b8 100
544
+ .b8 117
545
+ .b8 99
546
+ .b8 116
547
+ .b8 111
548
+ .b8 114
549
+ .b8 95
550
+ .b8 114
551
+ .b8 111
552
+ .b8 111
553
+ .b8 116
554
+ .b8 47
555
+ .b8 115
556
+ .b8 110
557
+ .b8 0
558
+ .b8 1
559
+ .b64 $L__func_begin0
560
+ .b64 $L__func_end0
561
+ .b8 2
562
+ .b8 116
563
+ .b8 114
564
+ .b8 105
565
+ .b8 116
566
+ .b8 111
567
+ .b8 110
568
+ .b8 95
569
+ .b8 95
570
+ .b8 48
571
+ .b8 100
572
+ .b8 49
573
+ .b8 100
574
+ .b8 50
575
+ .b8 100
576
+ .b8 51
577
+ .b8 100
578
+ .b8 52
579
+ .b8 100
580
+ .b8 53
581
+ .b8 100
582
+ .b8 54
583
+ .b8 100
584
+ .b8 55
585
+ .b8 100
586
+ .b8 101
587
+ .b8 56
588
+ .b8 100
589
+ .b8 101
590
+ .b8 0
591
+ .b8 116
592
+ .b8 114
593
+ .b8 105
594
+ .b8 116
595
+ .b8 111
596
+ .b8 110
597
+ .b8 95
598
+ .b8 95
599
+ .b8 48
600
+ .b8 100
601
+ .b8 49
602
+ .b8 100
603
+ .b8 50
604
+ .b8 100
605
+ .b8 51
606
+ .b8 100
607
+ .b8 52
608
+ .b8 100
609
+ .b8 53
610
+ .b8 100
611
+ .b8 54
612
+ .b8 100
613
+ .b8 55
614
+ .b8 100
615
+ .b8 101
616
+ .b8 56
617
+ .b8 100
618
+ .b8 101
619
+ .b8 0
620
+ .b8 1
621
+ .b8 18
622
+ .b8 1
623
+ .b8 1
624
+ .b8 3
625
+ .b64 $L__func_begin0
626
+ .b64 $L__func_end0
627
+ .b8 1
628
+ .b8 156
629
+ .b32 125
630
+ .b8 4
631
+ .b32 125
632
+ .b64 $L__tmp1
633
+ .b64 $L__tmp14
634
+ .b8 2
635
+ .b8 40
636
+ .b8 57
637
+ .b8 5
638
+ .b32 125
639
+ .b64 $L__tmp1
640
+ .b64 $L__tmp14
641
+ .b8 2
642
+ .b8 243
643
+ .b8 36
644
+ .b8 0
645
+ .b8 5
646
+ .b32 125
647
+ .b64 $L__tmp2
648
+ .b64 $L__tmp15
649
+ .b8 2
650
+ .b8 40
651
+ .b8 57
652
+ .b8 5
653
+ .b32 125
654
+ .b64 $L__tmp15
655
+ .b64 $L__tmp16
656
+ .b8 3
657
+ .b8 40
658
+ .b8 44
659
+ .b8 5
660
+ .b32 125
661
+ .b64 $L__tmp17
662
+ .b64 $L__tmp32
663
+ .b8 2
664
+ .b8 46
665
+ .b8 59
666
+ .b8 4
667
+ .b32 125
668
+ .b64 $L__tmp18
669
+ .b64 $L__tmp31
670
+ .b8 2
671
+ .b8 46
672
+ .b8 59
673
+ .b8 5
674
+ .b32 125
675
+ .b64 $L__tmp18
676
+ .b64 $L__tmp31
677
+ .b8 2
678
+ .b8 243
679
+ .b8 36
680
+ .b8 0
681
+ .b8 5
682
+ .b32 125
683
+ .b64 $L__tmp32
684
+ .b64 $L__tmp33
685
+ .b8 3
686
+ .b8 46
687
+ .b8 45
688
+ .b8 0
689
+ .b8 0
690
+ }
691
+ .section .debug_pubnames
692
+ {
693
+ .b32 $L__pubNames_end0-$L__pubNames_start0
694
+ $L__pubNames_start0:
695
+ .b8 2
696
+ .b8 0
697
+ .b32 .debug_info
698
+ .b32 407
699
+ .b32 125
700
+ .b8 116
701
+ .b8 114
702
+ .b8 105
703
+ .b8 116
704
+ .b8 111
705
+ .b8 110
706
+ .b8 95
707
+ .b8 95
708
+ .b8 48
709
+ .b8 100
710
+ .b8 49
711
+ .b8 100
712
+ .b8 50
713
+ .b8 100
714
+ .b8 51
715
+ .b8 100
716
+ .b8 52
717
+ .b8 100
718
+ .b8 53
719
+ .b8 100
720
+ .b8 54
721
+ .b8 100
722
+ .b8 55
723
+ .b8 100
724
+ .b8 101
725
+ .b8 56
726
+ .b8 100
727
+ .b8 101
728
+ .b8 0
729
+ .b32 0
730
+ $L__pubNames_end0:
731
+ }
732
+ .section .debug_pubtypes
733
+ {
734
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
735
+ $L__pubTypes_start0:
736
+ .b8 2
737
+ .b8 0
738
+ .b32 .debug_info
739
+ .b32 407
740
+ .b32 0
741
+ $L__pubTypes_end0:
742
+ }
743
+ .section .debug_loc { }
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32>
7
+ %cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32>
8
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
19
+ %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32>
20
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
21
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
22
+ %12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
23
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
26
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
27
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
28
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
29
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
30
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
31
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
32
+ %22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
33
+ %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
34
+ %24 = tt.load %23, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
35
+ %25 = arith.mulf %9, %12 : tensor<256xf32>
36
+ %26 = arith.select %2, %25, %cst_1 : tensor<256xi1>, tensor<256xf32>
37
+ %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
38
+ ^bb0(%arg9: f32, %arg10: f32):
39
+ %50 = arith.addf %arg9, %arg10 : f32
40
+ tt.reduce.return %50 : f32
41
+ }) : (tensor<256xf32>) -> f32
42
+ %28 = arith.addf %27, %cst_0 : f32
43
+ %29 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32>
44
+ %30 = arith.subf %15, %29 : tensor<256xf32>
45
+ %31 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32>
46
+ %32 = arith.mulf %30, %31 : tensor<256xf32>
47
+ %33 = arith.mulf %25, %32 : tensor<256xf32>
48
+ %34 = arith.select %2, %33, %cst_1 : tensor<256xi1>, tensor<256xf32>
49
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
50
+ ^bb0(%arg9: f32, %arg10: f32):
51
+ %50 = arith.addf %arg9, %arg10 : f32
52
+ tt.reduce.return %50 : f32
53
+ }) : (tensor<256xf32>) -> f32
54
+ %36 = arith.addf %35, %cst_0 : f32
55
+ %37 = arith.divf %21, %cst_3 : tensor<1xf32>
56
+ %38 = arith.mulf %25, %cst_2 : tensor<256xf32>
57
+ %39 = tt.splat %28 : (f32) -> tensor<256xf32>
58
+ %40 = arith.subf %38, %39 : tensor<256xf32>
59
+ %41 = tt.splat %36 : (f32) -> tensor<256xf32>
60
+ %42 = arith.mulf %32, %41 : tensor<256xf32>
61
+ %43 = arith.subf %40, %42 : tensor<256xf32>
62
+ %44 = tt.broadcast %37 : (tensor<1xf32>) -> tensor<256xf32>
63
+ %45 = arith.mulf %44, %43 : tensor<256xf32>
64
+ %46 = arith.addf %24, %45 : tensor<256xf32>
65
+ tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
66
+ %47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
67
+ %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
68
+ %49 = arith.truncf %46 : tensor<256xf32> to tensor<256xbf16>
69
+ tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
70
+ tt.return
71
+ }
72
+ }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin ADDED
Binary file (10.3 kB). View file
 
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 63, !dbg !8
9
+ %8 = lshr i32 %6, 6, !dbg !9
10
+ %9 = and i32 %8, 3, !dbg !9
11
+ %10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
12
+ %11 = shl i32 %10, 6, !dbg !11
13
+ %12 = or i32 %11, %7, !dbg !12
14
+ br label %13, !dbg !13
15
+
16
+ 13: ; preds = %5, %13
17
+ %14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ]
18
+ %15 = phi i32 [ 0, %5 ], [ %24, %13 ]
19
+ %16 = or i32 %15, %9, !dbg !14
20
+ %17 = shl i32 %16, 17, !dbg !15
21
+ %18 = add i32 %17, %12, !dbg !16
22
+ %19 = sext i32 %18 to i64, !dbg !17
23
+ %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17
24
+ %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18
25
+ %22 = bitcast i32 %21 to float, !dbg !18
26
+ %23 = fadd float %14, %22, !dbg !19
27
+ %24 = add nuw nsw i32 %15, 4, !dbg !13
28
+ %25 = icmp ult i32 %15, 116, !dbg !13
29
+ br i1 %25, label %13, label %26, !dbg !13
30
+
31
+ 26: ; preds = %13
32
+ %27 = shl nuw nsw i32 %7, 2, !dbg !20
33
+ %28 = or i32 %27, %9, !dbg !20
34
+ %29 = zext nneg i32 %28 to i64, !dbg !20
35
+ %30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20
36
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20
37
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
38
+ %31 = icmp slt i32 %6, 256, !dbg !20
39
+ %32 = sext i32 %6 to i64, !dbg !20
40
+ %33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20
41
+ %34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20
42
+ %35 = bitcast float %34 to i32, !dbg !20
43
+ %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20
44
+ %37 = bitcast i32 %36 to float, !dbg !20
45
+ %38 = fadd float %34, %37, !dbg !24
46
+ %39 = bitcast float %38 to i32, !dbg !20
47
+ %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20
48
+ %41 = bitcast i32 %40 to float, !dbg !20
49
+ %42 = fadd float %38, %41, !dbg !24
50
+ %43 = and i32 %6, 3, !dbg !20
51
+ %44 = icmp eq i32 %43, 0, !dbg !20
52
+ %45 = and i1 %31, %44, !dbg !20
53
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20
54
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
55
+ %46 = zext nneg i32 %27 to i64, !dbg !20
56
+ %47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20
57
+ %48 = load float, ptr addrspace(3) %47, align 4, !dbg !20
58
+ %.frozen = freeze i32 %12
59
+ %49 = sdiv i32 %.frozen, 256, !dbg !28
60
+ %50 = mul i32 %49, 256
61
+ %.decomposed = sub i32 %.frozen, %50
62
+ %51 = sext i32 %49 to i64, !dbg !29
63
+ %52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29
64
+ %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30
65
+ %54 = lshr i64 %53, 54, !dbg !31
66
+ %55 = and i64 %54, 512, !dbg !31
67
+ %56 = add i64 %55, %53, !dbg !31
68
+ %57 = shl i64 %56, 8, !dbg !32
69
+ %58 = sext i32 %.decomposed to i64, !dbg !33
70
+ %59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34
71
+ %60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34
72
+ %61 = icmp eq i32 %9, 0, !dbg !35
73
+ %62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35
74
+ %63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35
75
+ ret void, !dbg !36
76
+ }
77
+
78
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
79
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
80
+
81
+ ; Function Attrs: convergent nocallback nounwind
82
+ declare void @llvm.nvvm.barrier0() #1
83
+
84
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
85
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
86
+
87
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
88
+ attributes #1 = { convergent nocallback nounwind }
89
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
90
+ attributes #3 = { nounwind }
91
+
92
+ !llvm.module.flags = !{!0}
93
+ !llvm.dbg.cu = !{!1}
94
+ !nvvm.annotations = !{!3, !4, !4, !3}
95
+
96
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
97
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
98
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
99
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
100
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
101
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
102
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
103
+ !7 = !{}
104
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
105
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
106
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
107
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
108
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
109
+ !13 = !DILocation(line: 27, column: 36, scope: !5)
110
+ !14 = !DILocation(line: 28, column: 27, scope: !5)
111
+ !15 = !DILocation(line: 31, column: 47, scope: !5)
112
+ !16 = !DILocation(line: 31, column: 40, scope: !5)
113
+ !17 = !DILocation(line: 31, column: 34, scope: !5)
114
+ !18 = !DILocation(line: 31, column: 53, scope: !5)
115
+ !19 = !DILocation(line: 34, column: 38, scope: !5)
116
+ !20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
117
+ !21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
118
+ !22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
119
+ !23 = !DILocation(line: 35, column: 25, scope: !21)
120
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
121
+ !25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
122
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
123
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
124
+ !28 = !DILocation(line: 36, column: 20, scope: !5)
125
+ !29 = !DILocation(line: 38, column: 30, scope: !5)
126
+ !30 = !DILocation(line: 38, column: 35, scope: !5)
127
+ !31 = !DILocation(line: 41, column: 32, scope: !5)
128
+ !32 = !DILocation(line: 45, column: 40, scope: !5)
129
+ !33 = !DILocation(line: 45, column: 36, scope: !5)
130
+ !34 = !DILocation(line: 45, column: 30, scope: !5)
131
+ !35 = !DILocation(line: 45, column: 55, scope: !5)
132
+ !36 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
8
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %9 = and i32 %8, 31, !dbg !10
10
+ %10 = lshr i32 %8, 5, !dbg !10
11
+ %11 = and i32 %10, 1, !dbg !10
12
+ %urem = shl i32 %8, 2, !dbg !10
13
+ %12 = and i32 %urem, 252, !dbg !10
14
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %14 = shl i32 %13, 8, !dbg !12
16
+ %15 = or i32 %14, %12, !dbg !13
17
+ %16 = sext i32 %15 to i64, !dbg !14
18
+ %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
19
+ %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
21
+ %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
22
+ %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
23
+ %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
24
+ %23 = bitcast i32 %21 to float, !dbg !15
25
+ %24 = bitcast i32 %22 to float, !dbg !15
26
+ %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
27
+ %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
28
+ %27 = extractvalue { i32, i32 } %26, 0, !dbg !17
29
+ %28 = extractvalue { i32, i32 } %26, 1, !dbg !17
30
+ %29 = trunc i32 %27 to i16, !dbg !17
31
+ %extelt.offset = lshr i32 %27, 16, !dbg !17
32
+ %30 = trunc i32 %extelt.offset to i16, !dbg !17
33
+ %31 = trunc i32 %28 to i16, !dbg !17
34
+ %extelt.offset1 = lshr i32 %28, 16, !dbg !17
35
+ %32 = trunc i32 %extelt.offset1 to i16, !dbg !17
36
+ %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
37
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
38
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
39
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
40
+ %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
41
+ %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
42
+ %39 = extractvalue { i32, i32 } %38, 0, !dbg !20
43
+ %40 = extractvalue { i32, i32 } %38, 1, !dbg !20
44
+ %41 = trunc i32 %39 to i16, !dbg !20
45
+ %extelt.offset2 = lshr i32 %39, 16, !dbg !20
46
+ %42 = trunc i32 %extelt.offset2 to i16, !dbg !20
47
+ %43 = trunc i32 %40 to i16, !dbg !20
48
+ %extelt.offset3 = lshr i32 %40, 16, !dbg !20
49
+ %44 = trunc i32 %extelt.offset3 to i16, !dbg !20
50
+ %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
51
+ %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
52
+ %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
53
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
54
+ %49 = zext nneg i32 %12 to i64, !dbg !22
55
+ %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
56
+ %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
57
+ %52 = fadd float %35, %23, !dbg !24
58
+ %53 = fadd float %36, %24, !dbg !24
59
+ %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
60
+ %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
61
+ %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
62
+ %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
63
+ %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
64
+ %59 = fadd <2 x float> %58, %56, !dbg !24
65
+ %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
66
+ %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
67
+ %62 = fadd <2 x float> %59, %61, !dbg !25
68
+ %63 = fadd float %52, %47, !dbg !25
69
+ %64 = fadd float %53, %48, !dbg !25
70
+ %65 = extractelement <2 x float> %62, i64 0, !dbg !26
71
+ %66 = extractelement <2 x float> %62, i64 1, !dbg !26
72
+ %67 = fadd float %65, %66, !dbg !26
73
+ %68 = fadd float %67, %63, !dbg !26
74
+ %69 = fadd float %68, %64, !dbg !26
75
+ %70 = bitcast float %69 to i32, !dbg !32
76
+ %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
77
+ %72 = bitcast i32 %71 to float, !dbg !32
78
+ %73 = fadd float %69, %72, !dbg !26
79
+ %74 = bitcast float %73 to i32, !dbg !32
80
+ %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
81
+ %76 = bitcast i32 %75 to float, !dbg !32
82
+ %77 = fadd float %73, %76, !dbg !26
83
+ %78 = bitcast float %77 to i32, !dbg !32
84
+ %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
85
+ %80 = bitcast i32 %79 to float, !dbg !32
86
+ %81 = fadd float %77, %80, !dbg !26
87
+ %82 = bitcast float %81 to i32, !dbg !32
88
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
89
+ %84 = bitcast i32 %83 to float, !dbg !32
90
+ %85 = fadd float %81, %84, !dbg !26
91
+ %86 = bitcast float %85 to i32, !dbg !32
92
+ %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
93
+ %88 = bitcast i32 %87 to float, !dbg !32
94
+ %89 = fadd float %85, %88, !dbg !26
95
+ %90 = icmp eq i32 %9, 0, !dbg !32
96
+ %91 = zext nneg i32 %11 to i64, !dbg !32
97
+ %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
98
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
99
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
100
+ %93 = icmp slt i32 %8, 2, !dbg !32
101
+ %94 = sext i32 %8 to i64, !dbg !32
102
+ %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
103
+ %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
104
+ %97 = bitcast float %96 to i32, !dbg !32
105
+ %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
106
+ %99 = bitcast i32 %98 to float, !dbg !32
107
+ %100 = fadd float %96, %99, !dbg !26
108
+ %101 = and i32 %8, 1, !dbg !32
109
+ %102 = icmp eq i32 %101, 0, !dbg !32
110
+ %103 = and i1 %93, %102, !dbg !32
111
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
112
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
113
+ %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
114
+ %105 = fadd float %104, 0.000000e+00, !dbg !34
115
+ %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
116
+ %107 = fsub float %65, %106, !dbg !39
117
+ %108 = fsub float %66, %106, !dbg !39
118
+ %109 = fsub float %63, %106, !dbg !39
119
+ %110 = fsub float %64, %106, !dbg !39
120
+ %111 = fmul float %107, %107, !dbg !40
121
+ %112 = fmul float %108, %108, !dbg !40
122
+ %113 = fmul float %109, %109, !dbg !40
123
+ %114 = fmul float %110, %110, !dbg !40
124
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
125
+ %115 = fadd float %111, %112, !dbg !43
126
+ %116 = fadd float %113, %115, !dbg !43
127
+ %117 = fadd float %114, %116, !dbg !43
128
+ %118 = bitcast float %117 to i32, !dbg !41
129
+ %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
130
+ %120 = bitcast i32 %119 to float, !dbg !41
131
+ %121 = fadd float %117, %120, !dbg !43
132
+ %122 = bitcast float %121 to i32, !dbg !41
133
+ %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
134
+ %124 = bitcast i32 %123 to float, !dbg !41
135
+ %125 = fadd float %121, %124, !dbg !43
136
+ %126 = bitcast float %125 to i32, !dbg !41
137
+ %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
138
+ %128 = bitcast i32 %127 to float, !dbg !41
139
+ %129 = fadd float %125, %128, !dbg !43
140
+ %130 = bitcast float %129 to i32, !dbg !41
141
+ %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
142
+ %132 = bitcast i32 %131 to float, !dbg !41
143
+ %133 = fadd float %129, %132, !dbg !43
144
+ %134 = bitcast float %133 to i32, !dbg !41
145
+ %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
146
+ %136 = bitcast i32 %135 to float, !dbg !41
147
+ %137 = fadd float %133, %136, !dbg !43
148
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
149
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
150
+ %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
151
+ %139 = bitcast float %138 to i32, !dbg !41
152
+ %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
153
+ %141 = bitcast i32 %140 to float, !dbg !41
154
+ %142 = fadd float %138, %141, !dbg !43
155
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
156
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
157
+ %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
158
+ %144 = fadd float %143, 0.000000e+00, !dbg !46
159
+ %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
160
+ %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
161
+ %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
162
+ %.not.i = icmp eq i32 %147, 0, !dbg !50
163
+ br i1 %.not.i, label %150, label %148, !dbg !50
164
+
165
+ 148: ; preds = %7
166
+ %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
167
+ br label %__nv_rsqrtf.exit, !dbg !50
168
+
169
+ 150: ; preds = %7
170
+ %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
171
+ br label %__nv_rsqrtf.exit, !dbg !50
172
+
173
+ __nv_rsqrtf.exit: ; preds = %148, %150
174
+ %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
175
+ %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
176
+ %153 = bitcast i32 %152 to float, !dbg !23
177
+ %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
178
+ %155 = bitcast i32 %154 to float, !dbg !23
179
+ %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
180
+ %157 = bitcast i32 %156 to float, !dbg !23
181
+ %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
182
+ %159 = bitcast i32 %158 to float, !dbg !23
183
+ %160 = fmul float %107, %.0.i, !dbg !51
184
+ %161 = fmul float %108, %.0.i, !dbg !51
185
+ %162 = fmul float %109, %.0.i, !dbg !51
186
+ %163 = fmul float %110, %.0.i, !dbg !51
187
+ %164 = fmul float %160, %159, !dbg !52
188
+ %165 = fmul float %161, %157, !dbg !52
189
+ %166 = fmul float %162, %155, !dbg !52
190
+ %167 = fmul float %163, %153, !dbg !52
191
+ %168 = getelementptr float, ptr addrspace(1) %4, i64 %16, !dbg !53
192
+ %169 = bitcast float %164 to i32, !dbg !54
193
+ %170 = bitcast float %165 to i32, !dbg !54
194
+ %171 = bitcast float %166 to i32, !dbg !54
195
+ %172 = bitcast float %167 to i32, !dbg !54
196
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %169, i32 %170, i32 %171, i32 %172, ptr addrspace(1) %168, i1 true) #6, !dbg !54
197
+ ret void, !dbg !55
198
+ }
199
+
200
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
201
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
202
+
203
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
204
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
205
+
206
+ ; Function Attrs: convergent nocallback nounwind
207
+ declare void @llvm.nvvm.barrier0() #2
208
+
209
+ ; Function Attrs: alwaysinline nounwind
210
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
211
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
212
+ %.not = icmp eq i32 %1, 0
213
+ br i1 %.not, label %4, label %2
214
+
215
+ 2: ; preds = %0
216
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
217
+ br label %6
218
+
219
+ 4: ; preds = %0
220
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
221
+ br label %6
222
+
223
+ 6: ; preds = %4, %2
224
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
225
+ ret float %.0
226
+ }
227
+
228
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
229
+
230
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
231
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
232
+
233
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
234
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
235
+
236
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
237
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
238
+ attributes #2 = { convergent nocallback nounwind }
239
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
240
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
241
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
242
+ attributes #6 = { nounwind }
243
+
244
+ !llvm.module.flags = !{!0, !1}
245
+ !llvm.dbg.cu = !{!2}
246
+ !nvvm.annotations = !{!4, !5, !5, !4}
247
+ !llvm.ident = !{!6}
248
+
249
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
250
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
251
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
252
+ !3 = !DIFile(filename: "cpedrbcgvftrmo3x6vfpo6dhkxbweq3ucfj5jibyyvr3hf67gsvx.py", directory: "/tmp/torchinductor_root/pe")
253
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
254
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
255
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
256
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
257
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
258
+ !9 = !{}
259
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
260
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
261
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
262
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
263
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
264
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
265
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
266
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
267
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
268
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
269
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
270
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
271
+ !22 = !DILocation(line: 33, column: 31, scope: !7)
272
+ !23 = !DILocation(line: 33, column: 36, scope: !7)
273
+ !24 = !DILocation(line: 35, column: 18, scope: !7)
274
+ !25 = !DILocation(line: 37, column: 18, scope: !7)
275
+ !26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
276
+ !27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
277
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
278
+ !29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
279
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
280
+ !31 = !DILocation(line: 42, column: 59, scope: !27)
281
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
282
+ !33 = !DILocation(line: 42, column: 59, scope: !29)
283
+ !34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
284
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
285
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
286
+ !37 = !DILocation(line: 42, column: 45, scope: !35)
287
+ !38 = !DILocation(line: 45, column: 20, scope: !7)
288
+ !39 = !DILocation(line: 46, column: 19, scope: !7)
289
+ !40 = !DILocation(line: 47, column: 20, scope: !7)
290
+ !41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
291
+ !42 = !DILocation(line: 50, column: 59, scope: !29)
292
+ !43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
293
+ !44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
294
+ !45 = !DILocation(line: 50, column: 59, scope: !27)
295
+ !46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
296
+ !47 = !DILocation(line: 50, column: 45, scope: !35)
297
+ !48 = !DILocation(line: 53, column: 20, scope: !7)
298
+ !49 = !DILocation(line: 55, column: 20, scope: !7)
299
+ !50 = !DILocation(line: 56, column: 26, scope: !7)
300
+ !51 = !DILocation(line: 57, column: 20, scope: !7)
301
+ !52 = !DILocation(line: 58, column: 20, scope: !7)
302
+ !53 = !DILocation(line: 59, column: 25, scope: !7)
303
+ !54 = !DILocation(line: 59, column: 48, scope: !7)
304
+ !55 = !DILocation(line: 59, column: 4, scope: !7)
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
27
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
28
+ %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
29
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
31
+ %20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
32
+ %21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
33
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
34
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
35
+ ^bb0(%arg7: f32, %arg8: f32):
36
+ %40 = arith.addf %arg7, %arg8 : f32
37
+ tt.reduce.return %40 : f32
38
+ }) : (tensor<256xf32, #blocked>) -> f32
39
+ %24 = arith.addf %23, %cst_2 : f32
40
+ %25 = arith.divf %24, %cst_1 : f32
41
+ %26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
42
+ %27 = arith.subf %21, %26 : tensor<256xf32, #blocked>
43
+ %28 = arith.mulf %27, %27 : tensor<256xf32, #blocked>
44
+ %29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
45
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg7: f32, %arg8: f32):
47
+ %40 = arith.addf %arg7, %arg8 : f32
48
+ tt.reduce.return %40 : f32
49
+ }) : (tensor<256xf32, #blocked>) -> f32
50
+ %31 = arith.addf %30, %cst_2 : f32
51
+ %32 = arith.divf %31, %cst_1 : f32
52
+ %33 = arith.addf %32, %cst_0 : f32
53
+ %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
54
+ %35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked>
55
+ %36 = arith.mulf %27, %35 : tensor<256xf32, #blocked>
56
+ %37 = arith.mulf %36, %19 : tensor<256xf32, #blocked>
57
+ %38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
58
+ %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
59
+ tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
60
+ tt.return
61
+ }
62
+ }
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
28
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
30
+ %20 = arith.addf %8, %12 : tensor<256xf32>
31
+ %21 = arith.addf %20, %16 : tensor<256xf32>
32
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
33
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
34
+ ^bb0(%arg7: f32, %arg8: f32):
35
+ %40 = arith.addf %arg7, %arg8 : f32
36
+ tt.reduce.return %40 : f32
37
+ }) : (tensor<256xf32>) -> f32
38
+ %24 = arith.addf %23, %cst_0 : f32
39
+ %25 = arith.divf %24, %cst_1 : f32
40
+ %26 = tt.splat %25 : (f32) -> tensor<256xf32>
41
+ %27 = arith.subf %21, %26 : tensor<256xf32>
42
+ %28 = arith.mulf %27, %27 : tensor<256xf32>
43
+ %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
44
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
45
+ ^bb0(%arg7: f32, %arg8: f32):
46
+ %40 = arith.addf %arg7, %arg8 : f32
47
+ tt.reduce.return %40 : f32
48
+ }) : (tensor<256xf32>) -> f32
49
+ %31 = arith.addf %30, %cst_0 : f32
50
+ %32 = arith.divf %31, %cst_1 : f32
51
+ %33 = arith.addf %32, %cst_2 : f32
52
+ %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
53
+ %35 = tt.splat %34 : (f32) -> tensor<256xf32>
54
+ %36 = arith.mulf %27, %35 : tensor<256xf32>
55
+ %37 = arith.mulf %36, %19 : tensor<256xf32>
56
+ %38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
57
+ %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
58
+ tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
59
+ tt.return
60
+ }
61
+ }
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2d3d4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4) local_unnamed_addr !dbg !5 {
5
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %7 = shl i32 %6, 3, !dbg !8
7
+ %8 = and i32 %7, 1016, !dbg !8
8
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %10 = shl i32 %9, 10, !dbg !10
10
+ %11 = or i32 %10, %8, !dbg !11
11
+ %.frozen = freeze i32 %11
12
+ %12 = sdiv i32 %.frozen, 256, !dbg !12
13
+ %13 = srem i32 %12, 3, !dbg !13
14
+ %14 = mul i32 %12, 256
15
+ %.decomposed = sub i32 %.frozen, %14
16
+ %15 = sdiv i32 %11, 768, !dbg !14
17
+ %16 = shl nsw i32 %15, 8, !dbg !15
18
+ %17 = add nsw i32 %16, %.decomposed, !dbg !16
19
+ %18 = sext i32 %17 to i64, !dbg !17
20
+ %19 = getelementptr i16, ptr addrspace(1) %0, i64 %18, !dbg !17
21
+ %20 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %19, i1 true) #1, !dbg !18
22
+ %21 = extractvalue { i32, i32, i32, i32 } %20, 0, !dbg !18
23
+ %22 = extractvalue { i32, i32, i32, i32 } %20, 1, !dbg !18
24
+ %23 = extractvalue { i32, i32, i32, i32 } %20, 2, !dbg !18
25
+ %24 = extractvalue { i32, i32, i32, i32 } %20, 3, !dbg !18
26
+ %25 = trunc i32 %21 to i16, !dbg !18
27
+ %extelt.offset = lshr i32 %21, 16, !dbg !18
28
+ %26 = trunc i32 %extelt.offset to i16, !dbg !18
29
+ %27 = trunc i32 %22 to i16, !dbg !18
30
+ %extelt.offset1 = lshr i32 %22, 16, !dbg !18
31
+ %28 = trunc i32 %extelt.offset1 to i16, !dbg !18
32
+ %29 = trunc i32 %23 to i16, !dbg !18
33
+ %extelt.offset2 = lshr i32 %23, 16, !dbg !18
34
+ %30 = trunc i32 %extelt.offset2 to i16, !dbg !18
35
+ %31 = trunc i32 %24 to i16, !dbg !18
36
+ %extelt.offset3 = lshr i32 %24, 16, !dbg !18
37
+ %32 = trunc i32 %extelt.offset3 to i16, !dbg !18
38
+ %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #1, !dbg !19
39
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #1, !dbg !19
40
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #1, !dbg !19
41
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #1, !dbg !19
42
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #1, !dbg !19
43
+ %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #1, !dbg !19
44
+ %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #1, !dbg !19
45
+ %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #1, !dbg !19
46
+ %41 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !20
47
+ %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #1, !dbg !21
48
+ %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21
49
+ %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21
50
+ %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21
51
+ %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21
52
+ %47 = trunc i32 %43 to i16, !dbg !21
53
+ %extelt.offset4 = lshr i32 %43, 16, !dbg !21
54
+ %48 = trunc i32 %extelt.offset4 to i16, !dbg !21
55
+ %49 = trunc i32 %44 to i16, !dbg !21
56
+ %extelt.offset5 = lshr i32 %44, 16, !dbg !21
57
+ %50 = trunc i32 %extelt.offset5 to i16, !dbg !21
58
+ %51 = trunc i32 %45 to i16, !dbg !21
59
+ %extelt.offset6 = lshr i32 %45, 16, !dbg !21
60
+ %52 = trunc i32 %extelt.offset6 to i16, !dbg !21
61
+ %53 = trunc i32 %46 to i16, !dbg !21
62
+ %extelt.offset7 = lshr i32 %46, 16, !dbg !21
63
+ %54 = trunc i32 %extelt.offset7 to i16, !dbg !21
64
+ %55 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #1, !dbg !22
65
+ %56 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #1, !dbg !22
66
+ %57 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #1, !dbg !22
67
+ %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #1, !dbg !22
68
+ %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #1, !dbg !22
69
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #1, !dbg !22
70
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #1, !dbg !22
71
+ %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #1, !dbg !22
72
+ %63 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !23
73
+ %64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %63, i1 true) #1, !dbg !24
74
+ %65 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !24
75
+ %66 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !24
76
+ %67 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !24
77
+ %68 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !24
78
+ %69 = trunc i32 %65 to i16, !dbg !24
79
+ %extelt.offset8 = lshr i32 %65, 16, !dbg !24
80
+ %70 = trunc i32 %extelt.offset8 to i16, !dbg !24
81
+ %71 = trunc i32 %66 to i16, !dbg !24
82
+ %extelt.offset9 = lshr i32 %66, 16, !dbg !24
83
+ %72 = trunc i32 %extelt.offset9 to i16, !dbg !24
84
+ %73 = trunc i32 %67 to i16, !dbg !24
85
+ %extelt.offset10 = lshr i32 %67, 16, !dbg !24
86
+ %74 = trunc i32 %extelt.offset10 to i16, !dbg !24
87
+ %75 = trunc i32 %68 to i16, !dbg !24
88
+ %extelt.offset11 = lshr i32 %68, 16, !dbg !24
89
+ %76 = trunc i32 %extelt.offset11 to i16, !dbg !24
90
+ %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #1, !dbg !25
91
+ %78 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #1, !dbg !25
92
+ %79 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #1, !dbg !25
93
+ %80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #1, !dbg !25
94
+ %81 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #1, !dbg !25
95
+ %82 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #1, !dbg !25
96
+ %83 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %75) #1, !dbg !25
97
+ %84 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %76) #1, !dbg !25
98
+ %85 = icmp eq i32 %13, 2, !dbg !26
99
+ %86 = select i1 %85, float %33, float 0.000000e+00, !dbg !27
100
+ %87 = select i1 %85, float %34, float 0.000000e+00, !dbg !27
101
+ %88 = select i1 %85, float %35, float 0.000000e+00, !dbg !27
102
+ %89 = select i1 %85, float %36, float 0.000000e+00, !dbg !27
103
+ %90 = select i1 %85, float %37, float 0.000000e+00, !dbg !27
104
+ %91 = select i1 %85, float %38, float 0.000000e+00, !dbg !27
105
+ %92 = select i1 %85, float %39, float 0.000000e+00, !dbg !27
106
+ %93 = select i1 %85, float %40, float 0.000000e+00, !dbg !27
107
+ %94 = icmp eq i32 %13, 1, !dbg !28
108
+ %95 = select i1 %94, float %55, float 0.000000e+00, !dbg !29
109
+ %96 = select i1 %94, float %56, float 0.000000e+00, !dbg !29
110
+ %97 = select i1 %94, float %57, float 0.000000e+00, !dbg !29
111
+ %98 = select i1 %94, float %58, float 0.000000e+00, !dbg !29
112
+ %99 = select i1 %94, float %59, float 0.000000e+00, !dbg !29
113
+ %100 = select i1 %94, float %60, float 0.000000e+00, !dbg !29
114
+ %101 = select i1 %94, float %61, float 0.000000e+00, !dbg !29
115
+ %102 = select i1 %94, float %62, float 0.000000e+00, !dbg !29
116
+ %103 = fadd float %86, %95, !dbg !30
117
+ %104 = fadd float %87, %96, !dbg !30
118
+ %105 = fadd float %88, %97, !dbg !30
119
+ %106 = fadd float %89, %98, !dbg !30
120
+ %107 = fadd float %90, %99, !dbg !30
121
+ %108 = fadd float %91, %100, !dbg !30
122
+ %109 = fadd float %92, %101, !dbg !30
123
+ %110 = fadd float %93, %102, !dbg !30
124
+ %111 = icmp eq i32 %13, 0, !dbg !31
125
+ %112 = select i1 %111, float %77, float 0.000000e+00, !dbg !32
126
+ %113 = select i1 %111, float %78, float 0.000000e+00, !dbg !32
127
+ %114 = select i1 %111, float %79, float 0.000000e+00, !dbg !32
128
+ %115 = select i1 %111, float %80, float 0.000000e+00, !dbg !32
129
+ %116 = select i1 %111, float %81, float 0.000000e+00, !dbg !32
130
+ %117 = select i1 %111, float %82, float 0.000000e+00, !dbg !32
131
+ %118 = select i1 %111, float %83, float 0.000000e+00, !dbg !32
132
+ %119 = select i1 %111, float %84, float 0.000000e+00, !dbg !32
133
+ %120 = fadd float %103, %112, !dbg !33
134
+ %121 = fadd float %104, %113, !dbg !33
135
+ %122 = fadd float %105, %114, !dbg !33
136
+ %123 = fadd float %106, %115, !dbg !33
137
+ %124 = fadd float %107, %116, !dbg !33
138
+ %125 = fadd float %108, %117, !dbg !33
139
+ %126 = fadd float %109, %118, !dbg !33
140
+ %127 = fadd float %110, %119, !dbg !33
141
+ %128 = sext i32 %11 to i64, !dbg !34
142
+ %129 = getelementptr i16, ptr addrspace(1) %3, i64 %128, !dbg !34
143
+ %130 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %120) #1, !dbg !35
144
+ %131 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %121) #1, !dbg !35
145
+ %132 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %122) #1, !dbg !35
146
+ %133 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %123) #1, !dbg !35
147
+ %134 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %124) #1, !dbg !35
148
+ %135 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %125) #1, !dbg !35
149
+ %136 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %126) #1, !dbg !35
150
+ %137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %127) #1, !dbg !35
151
+ %138 = insertelement <2 x i16> undef, i16 %130, i64 0, !dbg !35
152
+ %139 = insertelement <2 x i16> %138, i16 %131, i64 1, !dbg !35
153
+ %140 = bitcast <2 x i16> %139 to i32, !dbg !35
154
+ %141 = insertelement <2 x i16> undef, i16 %132, i64 0, !dbg !35
155
+ %142 = insertelement <2 x i16> %141, i16 %133, i64 1, !dbg !35
156
+ %143 = bitcast <2 x i16> %142 to i32, !dbg !35
157
+ %144 = insertelement <2 x i16> undef, i16 %134, i64 0, !dbg !35
158
+ %145 = insertelement <2 x i16> %144, i16 %135, i64 1, !dbg !35
159
+ %146 = bitcast <2 x i16> %145 to i32, !dbg !35
160
+ %147 = insertelement <2 x i16> undef, i16 %136, i64 0, !dbg !35
161
+ %148 = insertelement <2 x i16> %147, i16 %137, i64 1, !dbg !35
162
+ %149 = bitcast <2 x i16> %148 to i32, !dbg !35
163
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %140, i32 %143, i32 %146, i32 %149, ptr addrspace(1) %129, i1 true) #1, !dbg !35
164
+ ret void, !dbg !36
165
+ }
166
+
167
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
168
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
169
+
170
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
171
+ attributes #1 = { nounwind }
172
+
173
+ !llvm.module.flags = !{!0}
174
+ !llvm.dbg.cu = !{!1}
175
+ !nvvm.annotations = !{!3, !4, !4, !3}
176
+
177
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
178
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
179
+ !2 = !DIFile(filename: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py", directory: "/tmp/torchinductor_root/63")
180
+ !3 = !{ptr @triton__0d1d2d3d4de, !"kernel", i32 1}
181
+ !4 = !{ptr @triton__0d1d2d3d4de, !"maxntidx", i32 128}
182
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4de", linkageName: "triton__0d1d2d3d4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
183
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
184
+ !7 = !{}
185
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
186
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
187
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
188
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
189
+ !12 = !DILocation(line: 23, column: 20, scope: !5)
190
+ !13 = !DILocation(line: 23, column: 27, scope: !5)
191
+ !14 = !DILocation(line: 25, column: 20, scope: !5)
192
+ !15 = !DILocation(line: 27, column: 40, scope: !5)
193
+ !16 = !DILocation(line: 27, column: 36, scope: !5)
194
+ !17 = !DILocation(line: 27, column: 30, scope: !5)
195
+ !18 = !DILocation(line: 27, column: 46, scope: !5)
196
+ !19 = !DILocation(line: 27, column: 85, scope: !5)
197
+ !20 = !DILocation(line: 28, column: 30, scope: !5)
198
+ !21 = !DILocation(line: 28, column: 46, scope: !5)
199
+ !22 = !DILocation(line: 28, column: 85, scope: !5)
200
+ !23 = !DILocation(line: 29, column: 31, scope: !5)
201
+ !24 = !DILocation(line: 29, column: 47, scope: !5)
202
+ !25 = !DILocation(line: 29, column: 86, scope: !5)
203
+ !26 = !DILocation(line: 32, column: 19, scope: !5)
204
+ !27 = !DILocation(line: 34, column: 32, scope: !5)
205
+ !28 = !DILocation(line: 36, column: 19, scope: !5)
206
+ !29 = !DILocation(line: 37, column: 32, scope: !5)
207
+ !30 = !DILocation(line: 38, column: 19, scope: !5)
208
+ !31 = !DILocation(line: 40, column: 20, scope: !5)
209
+ !32 = !DILocation(line: 41, column: 35, scope: !5)
210
+ !33 = !DILocation(line: 42, column: 20, scope: !5)
211
+ !34 = !DILocation(line: 43, column: 25, scope: !5)
212
+ !35 = !DILocation(line: 43, column: 37, scope: !5)
213
+ !36 = !DILocation(line: 43, column: 4, scope: !5)
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4de
10
+
11
+ .visible .entry triton__0d1d2d3d4de(
12
+ .param .u64 triton__0d1d2d3d4de_param_0,
13
+ .param .u64 triton__0d1d2d3d4de_param_1,
14
+ .param .u64 triton__0d1d2d3d4de_param_2,
15
+ .param .u64 triton__0d1d2d3d4de_param_3,
16
+ .param .u32 triton__0d1d2d3d4de_param_4
17
+ )
18
+ .maxntid 128, 1, 1
19
+ {
20
+ .reg .pred %p<8>;
21
+ .reg .b16 %rs<33>;
22
+ .reg .b32 %r<77>;
23
+ .reg .f32 %f<65>;
24
+ .reg .b64 %rd<11>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0];
30
+ ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1];
31
+ $L__tmp0:
32
+ .loc 1 21 36
33
+ mov.u32 %r50, %tid.x;
34
+ shl.b32 %r51, %r50, 3;
35
+ ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2];
36
+ and.b32 %r52, %r51, 1016;
37
+ ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3];
38
+ .loc 1 20 28
39
+ mov.u32 %r1, %ctaid.x;
40
+ .loc 1 20 33
41
+ shl.b32 %r53, %r1, 10;
42
+ .loc 1 21 23
43
+ or.b32 %r54, %r53, %r52;
44
+ .loc 1 23 20
45
+ shr.s32 %r56, %r54, 31;
46
+ shr.u32 %r57, %r56, 24;
47
+ add.s32 %r58, %r54, %r57;
48
+ shr.s32 %r59, %r58, 8;
49
+ .loc 1 23 27
50
+ mul.hi.s32 %r60, %r59, 1431655766;
51
+ shr.u32 %r61, %r60, 31;
52
+ add.s32 %r62, %r60, %r61;
53
+ mul.lo.s32 %r63, %r62, 3;
54
+ sub.s32 %r64, %r59, %r63;
55
+ and.b32 %r65, %r58, -256;
56
+ sub.s32 %r66, %r54, %r65;
57
+ .loc 1 25 20
58
+ mul.hi.s32 %r67, %r54, 715827883;
59
+ shr.u32 %r68, %r67, 31;
60
+ shr.u32 %r69, %r67, 7;
61
+ add.s32 %r70, %r69, %r68;
62
+ .loc 1 27 40
63
+ shl.b32 %r71, %r70, 8;
64
+ .loc 1 27 36
65
+ add.s32 %r72, %r71, %r66;
66
+ .loc 1 27 30
67
+ mul.wide.s32 %rd9, %r72, 2;
68
+ add.s64 %rd1, %rd5, %rd9;
69
+ mov.pred %p1, -1;
70
+ .loc 1 27 46
71
+ mov.u32 %r2, 0x0;
72
+ mov.u32 %r3, 0x0;
73
+ mov.u32 %r4, 0x0;
74
+ mov.u32 %r5, 0x0;
75
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
76
+ cvt.u16.u32 %rs1, %r2;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
78
+ cvt.u16.u32 %rs3, %r3;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
80
+ cvt.u16.u32 %rs5, %r4;
81
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
82
+ cvt.u16.u32 %rs7, %r5;
83
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
84
+ .loc 1 27 85
85
+ cvt.f32.bf16 %r6, %rs1;
86
+ mov.b32 %f1, %r6;
87
+ cvt.f32.bf16 %r7, %rs2;
88
+ mov.b32 %f2, %r7;
89
+ cvt.f32.bf16 %r8, %rs3;
90
+ mov.b32 %f3, %r8;
91
+ cvt.f32.bf16 %r9, %rs4;
92
+ mov.b32 %f4, %r9;
93
+ cvt.f32.bf16 %r10, %rs5;
94
+ mov.b32 %f5, %r10;
95
+ cvt.f32.bf16 %r11, %rs6;
96
+ mov.b32 %f6, %r11;
97
+ cvt.f32.bf16 %r12, %rs7;
98
+ mov.b32 %f7, %r12;
99
+ cvt.f32.bf16 %r13, %rs8;
100
+ mov.b32 %f8, %r13;
101
+ .loc 1 28 30
102
+ add.s64 %rd2, %rd6, %rd9;
103
+ .loc 1 28 46
104
+ mov.u32 %r14, 0x0;
105
+ mov.u32 %r15, 0x0;
106
+ mov.u32 %r16, 0x0;
107
+ mov.u32 %r17, 0x0;
108
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
109
+ cvt.u16.u32 %rs9, %r14;
110
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
111
+ cvt.u16.u32 %rs11, %r15;
112
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
113
+ cvt.u16.u32 %rs13, %r16;
114
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
115
+ cvt.u16.u32 %rs15, %r17;
116
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
117
+ .loc 1 28 85
118
+ cvt.f32.bf16 %r18, %rs9;
119
+ mov.b32 %f9, %r18;
120
+ cvt.f32.bf16 %r19, %rs10;
121
+ mov.b32 %f10, %r19;
122
+ cvt.f32.bf16 %r20, %rs11;
123
+ mov.b32 %f11, %r20;
124
+ cvt.f32.bf16 %r21, %rs12;
125
+ mov.b32 %f12, %r21;
126
+ cvt.f32.bf16 %r22, %rs13;
127
+ mov.b32 %f13, %r22;
128
+ cvt.f32.bf16 %r23, %rs14;
129
+ mov.b32 %f14, %r23;
130
+ cvt.f32.bf16 %r24, %rs15;
131
+ mov.b32 %f15, %r24;
132
+ cvt.f32.bf16 %r25, %rs16;
133
+ mov.b32 %f16, %r25;
134
+ .loc 1 29 31
135
+ add.s64 %rd3, %rd7, %rd9;
136
+ .loc 1 29 47
137
+ mov.u32 %r26, 0x0;
138
+ mov.u32 %r27, 0x0;
139
+ mov.u32 %r28, 0x0;
140
+ mov.u32 %r29, 0x0;
141
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ];
142
+ cvt.u16.u32 %rs17, %r26;
143
+ { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; }
144
+ cvt.u16.u32 %rs19, %r27;
145
+ { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; }
146
+ cvt.u16.u32 %rs21, %r28;
147
+ { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; }
148
+ cvt.u16.u32 %rs23, %r29;
149
+ { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; }
150
+ .loc 1 29 86
151
+ cvt.f32.bf16 %r30, %rs17;
152
+ mov.b32 %f17, %r30;
153
+ cvt.f32.bf16 %r31, %rs18;
154
+ mov.b32 %f18, %r31;
155
+ cvt.f32.bf16 %r32, %rs19;
156
+ mov.b32 %f19, %r32;
157
+ cvt.f32.bf16 %r33, %rs20;
158
+ mov.b32 %f20, %r33;
159
+ cvt.f32.bf16 %r34, %rs21;
160
+ mov.b32 %f21, %r34;
161
+ cvt.f32.bf16 %r35, %rs22;
162
+ mov.b32 %f22, %r35;
163
+ cvt.f32.bf16 %r36, %rs23;
164
+ mov.b32 %f23, %r36;
165
+ cvt.f32.bf16 %r37, %rs24;
166
+ mov.b32 %f24, %r37;
167
+ .loc 1 32 19
168
+ setp.eq.s32 %p5, %r64, 2;
169
+ .loc 1 34 32
170
+ selp.f32 %f25, %f1, 0f00000000, %p5;
171
+ selp.f32 %f26, %f2, 0f00000000, %p5;
172
+ selp.f32 %f27, %f3, 0f00000000, %p5;
173
+ selp.f32 %f28, %f4, 0f00000000, %p5;
174
+ selp.f32 %f29, %f5, 0f00000000, %p5;
175
+ selp.f32 %f30, %f6, 0f00000000, %p5;
176
+ selp.f32 %f31, %f7, 0f00000000, %p5;
177
+ selp.f32 %f32, %f8, 0f00000000, %p5;
178
+ .loc 1 36 19
179
+ setp.eq.s32 %p6, %r64, 1;
180
+ .loc 1 37 32
181
+ selp.f32 %f33, %f9, 0f00000000, %p6;
182
+ selp.f32 %f34, %f10, 0f00000000, %p6;
183
+ selp.f32 %f35, %f11, 0f00000000, %p6;
184
+ selp.f32 %f36, %f12, 0f00000000, %p6;
185
+ selp.f32 %f37, %f13, 0f00000000, %p6;
186
+ selp.f32 %f38, %f14, 0f00000000, %p6;
187
+ selp.f32 %f39, %f15, 0f00000000, %p6;
188
+ selp.f32 %f40, %f16, 0f00000000, %p6;
189
+ .loc 1 38 19
190
+ add.f32 %f41, %f25, %f33;
191
+ add.f32 %f42, %f26, %f34;
192
+ add.f32 %f43, %f27, %f35;
193
+ add.f32 %f44, %f28, %f36;
194
+ add.f32 %f45, %f29, %f37;
195
+ add.f32 %f46, %f30, %f38;
196
+ add.f32 %f47, %f31, %f39;
197
+ add.f32 %f48, %f32, %f40;
198
+ .loc 1 40 20
199
+ setp.eq.s32 %p7, %r64, 0;
200
+ .loc 1 41 35
201
+ selp.f32 %f49, %f17, 0f00000000, %p7;
202
+ selp.f32 %f50, %f18, 0f00000000, %p7;
203
+ selp.f32 %f51, %f19, 0f00000000, %p7;
204
+ selp.f32 %f52, %f20, 0f00000000, %p7;
205
+ selp.f32 %f53, %f21, 0f00000000, %p7;
206
+ selp.f32 %f54, %f22, 0f00000000, %p7;
207
+ selp.f32 %f55, %f23, 0f00000000, %p7;
208
+ selp.f32 %f56, %f24, 0f00000000, %p7;
209
+ .loc 1 42 20
210
+ add.f32 %f57, %f41, %f49;
211
+ add.f32 %f58, %f42, %f50;
212
+ add.f32 %f59, %f43, %f51;
213
+ add.f32 %f60, %f44, %f52;
214
+ add.f32 %f61, %f45, %f53;
215
+ add.f32 %f62, %f46, %f54;
216
+ add.f32 %f63, %f47, %f55;
217
+ add.f32 %f64, %f48, %f56;
218
+ .loc 1 43 25
219
+ mul.wide.s32 %rd10, %r54, 2;
220
+ add.s64 %rd4, %rd8, %rd10;
221
+ .loc 1 43 37
222
+ mov.b32 %r38, %f57;
223
+ cvt.rn.bf16.f32 %rs25, %r38;
224
+ mov.b32 %r39, %f58;
225
+ cvt.rn.bf16.f32 %rs26, %r39;
226
+ mov.b32 %r40, %f59;
227
+ cvt.rn.bf16.f32 %rs27, %r40;
228
+ mov.b32 %r41, %f60;
229
+ cvt.rn.bf16.f32 %rs28, %r41;
230
+ mov.b32 %r42, %f61;
231
+ cvt.rn.bf16.f32 %rs29, %r42;
232
+ mov.b32 %r43, %f62;
233
+ cvt.rn.bf16.f32 %rs30, %r43;
234
+ mov.b32 %r44, %f63;
235
+ cvt.rn.bf16.f32 %rs31, %r44;
236
+ mov.b32 %r45, %f64;
237
+ cvt.rn.bf16.f32 %rs32, %r45;
238
+ mov.b32 %r73, {%rs25, %rs26};
239
+ mov.b32 %r74, {%rs27, %rs28};
240
+ mov.b32 %r75, {%rs29, %rs30};
241
+ mov.b32 %r76, {%rs31, %rs32};
242
+ @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 };
243
+ .loc 1 43 4
244
+ ret;
245
+ $L__tmp1:
246
+ $L__func_end0:
247
+
248
+ }
249
+ .file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
250
+ .section .debug_abbrev
251
+ {
252
+ .b8 1
253
+ .b8 17
254
+ .b8 1
255
+ .b8 37
256
+ .b8 8
257
+ .b8 19
258
+ .b8 5
259
+ .b8 3
260
+ .b8 8
261
+ .b8 16
262
+ .b8 6
263
+ .b8 27
264
+ .b8 8
265
+ .b8 180
266
+ .b8 66
267
+ .b8 12
268
+ .b8 17
269
+ .b8 1
270
+ .b8 18
271
+ .b8 1
272
+ .b8 0
273
+ .b8 0
274
+ .b8 2
275
+ .b8 46
276
+ .b8 0
277
+ .b8 17
278
+ .b8 1
279
+ .b8 18
280
+ .b8 1
281
+ .b8 64
282
+ .b8 10
283
+ .b8 135
284
+ .b8 64
285
+ .b8 8
286
+ .b8 3
287
+ .b8 8
288
+ .b8 58
289
+ .b8 11
290
+ .b8 59
291
+ .b8 11
292
+ .b8 63
293
+ .b8 12
294
+ .b8 0
295
+ .b8 0
296
+ .b8 0
297
+ }
298
+ .section .debug_info
299
+ {
300
+ .b32 184
301
+ .b8 2
302
+ .b8 0
303
+ .b32 .debug_abbrev
304
+ .b8 8
305
+ .b8 1
306
+ .b8 116
307
+ .b8 114
308
+ .b8 105
309
+ .b8 116
310
+ .b8 111
311
+ .b8 110
312
+ .b8 0
313
+ .b8 2
314
+ .b8 0
315
+ .b8 99
316
+ .b8 54
317
+ .b8 51
318
+ .b8 114
319
+ .b8 55
320
+ .b8 105
321
+ .b8 117
322
+ .b8 114
323
+ .b8 119
324
+ .b8 107
325
+ .b8 53
326
+ .b8 121
327
+ .b8 100
328
+ .b8 108
329
+ .b8 115
330
+ .b8 119
331
+ .b8 104
332
+ .b8 55
333
+ .b8 114
334
+ .b8 118
335
+ .b8 104
336
+ .b8 99
337
+ .b8 109
338
+ .b8 108
339
+ .b8 120
340
+ .b8 50
341
+ .b8 99
342
+ .b8 102
343
+ .b8 114
344
+ .b8 101
345
+ .b8 116
346
+ .b8 108
347
+ .b8 114
348
+ .b8 101
349
+ .b8 119
350
+ .b8 103
351
+ .b8 119
352
+ .b8 54
353
+ .b8 116
354
+ .b8 108
355
+ .b8 106
356
+ .b8 108
357
+ .b8 117
358
+ .b8 114
359
+ .b8 115
360
+ .b8 115
361
+ .b8 104
362
+ .b8 103
363
+ .b8 116
364
+ .b8 102
365
+ .b8 112
366
+ .b8 112
367
+ .b8 46
368
+ .b8 112
369
+ .b8 121
370
+ .b8 0
371
+ .b32 .debug_line
372
+ .b8 47
373
+ .b8 116
374
+ .b8 109
375
+ .b8 112
376
+ .b8 47
377
+ .b8 116
378
+ .b8 111
379
+ .b8 114
380
+ .b8 99
381
+ .b8 104
382
+ .b8 105
383
+ .b8 110
384
+ .b8 100
385
+ .b8 117
386
+ .b8 99
387
+ .b8 116
388
+ .b8 111
389
+ .b8 114
390
+ .b8 95
391
+ .b8 114
392
+ .b8 111
393
+ .b8 111
394
+ .b8 116
395
+ .b8 47
396
+ .b8 54
397
+ .b8 51
398
+ .b8 0
399
+ .b8 1
400
+ .b64 $L__func_begin0
401
+ .b64 $L__func_end0
402
+ .b8 2
403
+ .b64 $L__func_begin0
404
+ .b64 $L__func_end0
405
+ .b8 1
406
+ .b8 156
407
+ .b8 116
408
+ .b8 114
409
+ .b8 105
410
+ .b8 116
411
+ .b8 111
412
+ .b8 110
413
+ .b8 95
414
+ .b8 95
415
+ .b8 48
416
+ .b8 100
417
+ .b8 49
418
+ .b8 100
419
+ .b8 50
420
+ .b8 100
421
+ .b8 51
422
+ .b8 100
423
+ .b8 52
424
+ .b8 100
425
+ .b8 101
426
+ .b8 0
427
+ .b8 116
428
+ .b8 114
429
+ .b8 105
430
+ .b8 116
431
+ .b8 111
432
+ .b8 110
433
+ .b8 95
434
+ .b8 95
435
+ .b8 48
436
+ .b8 100
437
+ .b8 49
438
+ .b8 100
439
+ .b8 50
440
+ .b8 100
441
+ .b8 51
442
+ .b8 100
443
+ .b8 52
444
+ .b8 100
445
+ .b8 101
446
+ .b8 0
447
+ .b8 1
448
+ .b8 18
449
+ .b8 1
450
+ .b8 0
451
+ }
452
+ .section .debug_pubnames
453
+ {
454
+ .b32 $L__pubNames_end0-$L__pubNames_start0
455
+ $L__pubNames_start0:
456
+ .b8 2
457
+ .b8 0
458
+ .b32 .debug_info
459
+ .b32 188
460
+ .b32 125
461
+ .b8 116
462
+ .b8 114
463
+ .b8 105
464
+ .b8 116
465
+ .b8 111
466
+ .b8 110
467
+ .b8 95
468
+ .b8 95
469
+ .b8 48
470
+ .b8 100
471
+ .b8 49
472
+ .b8 100
473
+ .b8 50
474
+ .b8 100
475
+ .b8 51
476
+ .b8 100
477
+ .b8 52
478
+ .b8 100
479
+ .b8 101
480
+ .b8 0
481
+ .b32 0
482
+ $L__pubNames_end0:
483
+ }
484
+ .section .debug_pubtypes
485
+ {
486
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
487
+ $L__pubTypes_start0:
488
+ .b8 2
489
+ .b8 0
490
+ .b32 .debug_info
491
+ .b32 188
492
+ .b32 0
493
+ $L__pubTypes_end0:
494
+ }
495
+ .section .debug_loc { }
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 256, 1, 1
18
+ {
19
+ .reg .pred %p<10>;
20
+ .reg .b16 %rs<7>;
21
+ .reg .b32 %r<25>;
22
+ .reg .f32 %f<127>;
23
+ .reg .b64 %rd<8>;
24
+ .loc 1 18 0
25
+ $L__func_begin0:
26
+ .loc 1 18 0
27
+
28
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
29
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
30
+ $L__tmp0:
31
+ .loc 1 21 36
32
+ mov.u32 %r8, %tid.x;
33
+ shl.b32 %r9, %r8, 1;
34
+ and.b32 %r10, %r9, 510;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r11, %r1, 9;
39
+ .loc 1 21 23
40
+ or.b32 %r12, %r11, %r10;
41
+ .loc 1 24 34
42
+ mul.wide.s32 %rd6, %r12, 2;
43
+ add.s64 %rd7, %rd4, %rd6;
44
+ mov.pred %p1, -1;
45
+ .loc 1 24 39
46
+ mov.u32 %r2, 0x0;
47
+ @%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ];
48
+ .loc 1 25 30
49
+ add.s64 %rd3, %rd5, %rd6;
50
+ .loc 1 25 35
51
+ mov.u32 %r5, 0x0;
52
+ @%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ];
53
+ cvt.u16.u32 %rs3, %r5;
54
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
55
+ .loc 1 25 44
56
+ cvt.f32.bf16 %r6, %rs3;
57
+ mov.b32 %f3, %r6;
58
+ cvt.f32.bf16 %r7, %rs4;
59
+ mov.b32 %f4, %r7;
60
+ .loc 1 29 18
61
+ mul.f32 %f5, %f3, 0f3F3504F3;
62
+ .loc 1 30 23
63
+ abs.ftz.f32 %f7, %f5;
64
+ setp.ge.f32 %p3, %f7, 0f3F8060FE;
65
+ mov.f32 %f115, 0f3789CA3C;
66
+ mov.f32 %f114, 0fB9F560B9;
67
+ mov.f32 %f113, 0f3BAC840B;
68
+ mov.f32 %f112, 0fBD0C8162;
69
+ mov.f32 %f111, 0f3E1CF906;
70
+ mov.f32 %f110, 0f3F6A937E;
71
+ mov.f32 %f109, 0f3F20D842;
72
+ mov.f32 %f116, %f7;
73
+ @%p3 bra $L__BB0_2;
74
+ .loc 1 0 23
75
+ mov.f32 %f115, 0f38B1E96A;
76
+ mov.f32 %f114, 0fBA574D20;
77
+ mov.f32 %f113, 0f3BAAD5EA;
78
+ mov.f32 %f112, 0fBCDC1BE7;
79
+ mov.f32 %f111, 0f3DE718AF;
80
+ mov.f32 %f110, 0fBEC093AC;
81
+ mov.f32 %f109, 0f3E0375D3;
82
+ .loc 1 30 23
83
+ mul.f32 %f116, %f5, %f5;
84
+ $L__BB0_2:
85
+ .loc 1 0 0
86
+ cvt.u16.u32 %rs1, %r2;
87
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
88
+ mul.f32 %f6, %f4, 0f3F3504F3;
89
+ .loc 1 30 23
90
+ setp.ltu.f32 %p4, %f7, 0f3F8060FE;
91
+ fma.rn.ftz.f32 %f47, %f115, %f116, %f114;
92
+ fma.rn.ftz.f32 %f48, %f47, %f116, %f113;
93
+ fma.rn.ftz.f32 %f49, %f48, %f116, %f112;
94
+ fma.rn.ftz.f32 %f50, %f49, %f116, %f111;
95
+ fma.rn.ftz.f32 %f51, %f50, %f116, %f110;
96
+ fma.rn.ftz.f32 %f52, %f51, %f116, %f109;
97
+ neg.f32 %f53, %f116;
98
+ selp.f32 %f54, %f53, %f5, %p3;
99
+ fma.rn.ftz.f32 %f117, %f52, %f54, %f54;
100
+ mov.f32 %f108, 0f3F800000;
101
+ @%p4 bra $L__BB0_4;
102
+ ex2.approx.ftz.f32 %f55, %f117;
103
+ sub.f32 %f57, %f108, %f55;
104
+ mov.b32 %r13, %f57;
105
+ mov.b32 %r14, %f5;
106
+ and.b32 %r15, %r14, -2147483648;
107
+ or.b32 %r16, %r15, %r13;
108
+ mov.b32 %f117, %r16;
109
+ $L__BB0_4:
110
+ .loc 1 0 0
111
+ cvt.f32.bf16 %r3, %rs1;
112
+ cvt.f32.bf16 %r4, %rs2;
113
+ .loc 1 30 23
114
+ abs.ftz.f32 %f20, %f6;
115
+ setp.ge.f32 %p6, %f20, 0f3F8060FE;
116
+ mov.f32 %f124, 0f3789CA3C;
117
+ mov.f32 %f123, 0fB9F560B9;
118
+ mov.f32 %f122, 0f3BAC840B;
119
+ mov.f32 %f121, 0fBD0C8162;
120
+ mov.f32 %f120, 0f3E1CF906;
121
+ mov.f32 %f119, 0f3F6A937E;
122
+ mov.f32 %f118, 0f3F20D842;
123
+ mov.f32 %f125, %f20;
124
+ @%p6 bra $L__BB0_6;
125
+ mul.f32 %f125, %f6, %f6;
126
+ mov.f32 %f124, 0f38B1E96A;
127
+ mov.f32 %f123, 0fBA574D20;
128
+ mov.f32 %f122, 0f3BAAD5EA;
129
+ mov.f32 %f121, 0fBCDC1BE7;
130
+ mov.f32 %f120, 0f3DE718AF;
131
+ mov.f32 %f119, 0fBEC093AC;
132
+ mov.f32 %f118, 0f3E0375D3;
133
+ $L__BB0_6:
134
+ .loc 1 0 0
135
+ mov.b32 %f1, %r3;
136
+ mov.b32 %f2, %r4;
137
+ .loc 1 30 23
138
+ setp.ltu.f32 %p7, %f20, 0f3F8060FE;
139
+ fma.rn.ftz.f32 %f72, %f124, %f125, %f123;
140
+ fma.rn.ftz.f32 %f73, %f72, %f125, %f122;
141
+ fma.rn.ftz.f32 %f74, %f73, %f125, %f121;
142
+ fma.rn.ftz.f32 %f75, %f74, %f125, %f120;
143
+ fma.rn.ftz.f32 %f76, %f75, %f125, %f119;
144
+ fma.rn.ftz.f32 %f77, %f76, %f125, %f118;
145
+ neg.f32 %f78, %f125;
146
+ selp.f32 %f79, %f78, %f6, %p6;
147
+ fma.rn.ftz.f32 %f126, %f77, %f79, %f79;
148
+ @%p7 bra $L__BB0_8;
149
+ ex2.approx.ftz.f32 %f80, %f126;
150
+ sub.f32 %f82, %f108, %f80;
151
+ mov.b32 %r17, %f82;
152
+ mov.b32 %r18, %f6;
153
+ and.b32 %r19, %r18, -2147483648;
154
+ or.b32 %r20, %r19, %r17;
155
+ mov.b32 %f126, %r20;
156
+ $L__BB0_8:
157
+ .loc 1 32 18
158
+ add.f32 %f87, %f117, 0f3F800000;
159
+ add.f32 %f88, %f126, 0f3F800000;
160
+ .loc 1 35 19
161
+ mul.f32 %f89, %f3, %f3;
162
+ mul.f32 %f90, %f4, %f4;
163
+ .loc 1 37 20
164
+ mul.f32 %f91, %f89, 0fBF000000;
165
+ mul.f32 %f92, %f90, 0fBF000000;
166
+ .loc 1 38 19
167
+ mul.f32 %f84, %f91, 0f3FB8AA3B;
168
+ ex2.approx.f32 %f83, %f84;
169
+ mul.f32 %f86, %f92, 0f3FB8AA3B;
170
+ ex2.approx.f32 %f85, %f86;
171
+ .loc 1 40 20
172
+ mul.f32 %f93, %f83, 0f3ECC422A;
173
+ mul.f32 %f94, %f85, 0f3ECC422A;
174
+ .loc 1 41 19
175
+ mul.f32 %f95, %f3, %f93;
176
+ mul.f32 %f96, %f4, %f94;
177
+ .loc 1 42 20
178
+ fma.rn.f32 %f97, %f87, 0f3F000000, %f95;
179
+ fma.rn.f32 %f98, %f88, 0f3F000000, %f96;
180
+ .loc 1 43 19
181
+ mul.f32 %f99, %f1, %f97;
182
+ mul.f32 %f100, %f2, %f98;
183
+ .loc 1 45 40
184
+ mov.b32 %r21, %f99;
185
+ cvt.rn.bf16.f32 %rs5, %r21;
186
+ mov.b32 %r22, %f100;
187
+ cvt.rn.bf16.f32 %rs6, %r22;
188
+ mov.b32 %r24, {%rs5, %rs6};
189
+ @%p1 st.global.b32 [ %rd7 + 0 ], { %r24 };
190
+ .loc 1 45 4
191
+ ret;
192
+ $L__tmp1:
193
+ $L__func_end0:
194
+
195
+ }
196
+ // .globl __nv_erff
197
+ .visible .func (.param .b32 func_retval0) __nv_erff(
198
+ .param .b32 __nv_erff_param_0
199
+ )
200
+ {
201
+ .reg .pred %p<4>;
202
+ .reg .b32 %r<5>;
203
+ .reg .f32 %f<49>;
204
+ $L__func_begin1:
205
+
206
+ ld.param.f32 %f14, [__nv_erff_param_0];
207
+ abs.ftz.f32 %f1, %f14;
208
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
209
+ mov.f32 %f46, 0f3789CA3C;
210
+ mov.f32 %f45, 0fB9F560B9;
211
+ mov.f32 %f44, 0f3BAC840B;
212
+ mov.f32 %f43, 0fBD0C8162;
213
+ mov.f32 %f42, 0f3E1CF906;
214
+ mov.f32 %f41, 0f3F6A937E;
215
+ mov.f32 %f40, 0f3F20D842;
216
+ mov.f32 %f47, %f1;
217
+ @%p1 bra $L__BB1_2;
218
+ mul.f32 %f47, %f14, %f14;
219
+ mov.f32 %f46, 0f38B1E96A;
220
+ mov.f32 %f45, 0fBA574D20;
221
+ mov.f32 %f44, 0f3BAAD5EA;
222
+ mov.f32 %f43, 0fBCDC1BE7;
223
+ mov.f32 %f42, 0f3DE718AF;
224
+ mov.f32 %f41, 0fBEC093AC;
225
+ mov.f32 %f40, 0f3E0375D3;
226
+ $L__BB1_2:
227
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
228
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
229
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
230
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
231
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
232
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
233
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
234
+ neg.f32 %f35, %f47;
235
+ selp.f32 %f36, %f35, %f14, %p1;
236
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
237
+ @%p2 bra $L__BB1_4;
238
+ ex2.approx.ftz.f32 %f37, %f48;
239
+ mov.f32 %f38, 0f3F800000;
240
+ sub.f32 %f39, %f38, %f37;
241
+ mov.b32 %r1, %f39;
242
+ mov.b32 %r2, %f14;
243
+ and.b32 %r3, %r2, -2147483648;
244
+ or.b32 %r4, %r3, %r1;
245
+ mov.b32 %f48, %r4;
246
+ $L__BB1_4:
247
+ st.param.f32 [func_retval0+0], %f48;
248
+ ret;
249
+ $L__func_end1:
250
+
251
+ }
252
+ .file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
253
+ .section .debug_abbrev
254
+ {
255
+ .b8 1
256
+ .b8 17
257
+ .b8 1
258
+ .b8 37
259
+ .b8 8
260
+ .b8 19
261
+ .b8 5
262
+ .b8 3
263
+ .b8 8
264
+ .b8 16
265
+ .b8 6
266
+ .b8 27
267
+ .b8 8
268
+ .b8 180
269
+ .b8 66
270
+ .b8 12
271
+ .b8 17
272
+ .b8 1
273
+ .b8 18
274
+ .b8 1
275
+ .b8 0
276
+ .b8 0
277
+ .b8 2
278
+ .b8 46
279
+ .b8 0
280
+ .b8 17
281
+ .b8 1
282
+ .b8 18
283
+ .b8 1
284
+ .b8 64
285
+ .b8 10
286
+ .b8 135
287
+ .b8 64
288
+ .b8 8
289
+ .b8 3
290
+ .b8 8
291
+ .b8 58
292
+ .b8 11
293
+ .b8 59
294
+ .b8 11
295
+ .b8 63
296
+ .b8 12
297
+ .b8 0
298
+ .b8 0
299
+ .b8 0
300
+ }
301
+ .section .debug_info
302
+ {
303
+ .b32 176
304
+ .b8 2
305
+ .b8 0
306
+ .b32 .debug_abbrev
307
+ .b8 8
308
+ .b8 1
309
+ .b8 116
310
+ .b8 114
311
+ .b8 105
312
+ .b8 116
313
+ .b8 111
314
+ .b8 110
315
+ .b8 0
316
+ .b8 2
317
+ .b8 0
318
+ .b8 99
319
+ .b8 53
320
+ .b8 106
321
+ .b8 120
322
+ .b8 97
323
+ .b8 103
324
+ .b8 117
325
+ .b8 120
326
+ .b8 104
327
+ .b8 111
328
+ .b8 51
329
+ .b8 110
330
+ .b8 104
331
+ .b8 114
332
+ .b8 108
333
+ .b8 116
334
+ .b8 53
335
+ .b8 118
336
+ .b8 99
337
+ .b8 105
338
+ .b8 110
339
+ .b8 110
340
+ .b8 122
341
+ .b8 53
342
+ .b8 102
343
+ .b8 101
344
+ .b8 118
345
+ .b8 111
346
+ .b8 100
347
+ .b8 117
348
+ .b8 109
349
+ .b8 108
350
+ .b8 112
351
+ .b8 119
352
+ .b8 110
353
+ .b8 52
354
+ .b8 119
355
+ .b8 121
356
+ .b8 98
357
+ .b8 50
358
+ .b8 118
359
+ .b8 120
360
+ .b8 51
361
+ .b8 120
362
+ .b8 114
363
+ .b8 118
364
+ .b8 101
365
+ .b8 105
366
+ .b8 99
367
+ .b8 101
368
+ .b8 114
369
+ .b8 108
370
+ .b8 46
371
+ .b8 112
372
+ .b8 121
373
+ .b8 0
374
+ .b32 .debug_line
375
+ .b8 47
376
+ .b8 116
377
+ .b8 109
378
+ .b8 112
379
+ .b8 47
380
+ .b8 116
381
+ .b8 111
382
+ .b8 114
383
+ .b8 99
384
+ .b8 104
385
+ .b8 105
386
+ .b8 110
387
+ .b8 100
388
+ .b8 117
389
+ .b8 99
390
+ .b8 116
391
+ .b8 111
392
+ .b8 114
393
+ .b8 95
394
+ .b8 114
395
+ .b8 111
396
+ .b8 111
397
+ .b8 116
398
+ .b8 47
399
+ .b8 53
400
+ .b8 106
401
+ .b8 0
402
+ .b8 1
403
+ .b64 $L__func_begin0
404
+ .b64 $L__func_end0
405
+ .b8 2
406
+ .b64 $L__func_begin0
407
+ .b64 $L__func_end0
408
+ .b8 1
409
+ .b8 156
410
+ .b8 116
411
+ .b8 114
412
+ .b8 105
413
+ .b8 116
414
+ .b8 111
415
+ .b8 110
416
+ .b8 95
417
+ .b8 95
418
+ .b8 48
419
+ .b8 100
420
+ .b8 49
421
+ .b8 100
422
+ .b8 50
423
+ .b8 100
424
+ .b8 101
425
+ .b8 0
426
+ .b8 116
427
+ .b8 114
428
+ .b8 105
429
+ .b8 116
430
+ .b8 111
431
+ .b8 110
432
+ .b8 95
433
+ .b8 95
434
+ .b8 48
435
+ .b8 100
436
+ .b8 49
437
+ .b8 100
438
+ .b8 50
439
+ .b8 100
440
+ .b8 101
441
+ .b8 0
442
+ .b8 1
443
+ .b8 18
444
+ .b8 1
445
+ .b8 0
446
+ }
447
+ .section .debug_pubnames
448
+ {
449
+ .b32 $L__pubNames_end0-$L__pubNames_start0
450
+ $L__pubNames_start0:
451
+ .b8 2
452
+ .b8 0
453
+ .b32 .debug_info
454
+ .b32 180
455
+ .b32 125
456
+ .b8 116
457
+ .b8 114
458
+ .b8 105
459
+ .b8 116
460
+ .b8 111
461
+ .b8 110
462
+ .b8 95
463
+ .b8 95
464
+ .b8 48
465
+ .b8 100
466
+ .b8 49
467
+ .b8 100
468
+ .b8 50
469
+ .b8 100
470
+ .b8 101
471
+ .b8 0
472
+ .b32 0
473
+ $L__pubNames_end0:
474
+ }
475
+ .section .debug_pubtypes
476
+ {
477
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
478
+ $L__pubTypes_start0:
479
+ .b8 2
480
+ .b8 0
481
+ .b32 .debug_info
482
+ .b32 180
483
+ .b32 0
484
+ $L__pubTypes_end0:
485
+ }
486
+ .section .debug_loc { }
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.398942292> : tensor<512xf32, #blocked>
5
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
7
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
8
+ %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked>
9
+ %c512_i32 = arith.constant 512 : i32
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = arith.muli %0, %c512_i32 : i32
12
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
13
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
14
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
15
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
16
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
17
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
18
+ %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
20
+ %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
21
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
22
+ %12 = arith.extf %11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
23
+ %13 = arith.mulf %12, %cst_3 : tensor<512xf32, #blocked>
24
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked>
25
+ %15 = arith.addf %14, %cst_2 : tensor<512xf32, #blocked>
26
+ %16 = arith.mulf %15, %cst_1 : tensor<512xf32, #blocked>
27
+ %17 = arith.mulf %12, %12 : tensor<512xf32, #blocked>
28
+ %18 = arith.mulf %17, %cst_0 : tensor<512xf32, #blocked>
29
+ %19 = math.exp %18 : tensor<512xf32, #blocked>
30
+ %20 = arith.mulf %19, %cst : tensor<512xf32, #blocked>
31
+ %21 = arith.mulf %12, %20 : tensor<512xf32, #blocked>
32
+ %22 = arith.addf %16, %21 : tensor<512xf32, #blocked>
33
+ %23 = arith.mulf %8, %22 : tensor<512xf32, #blocked>
34
+ %24 = arith.truncf %23 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
35
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<16x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<16x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<16x1xi64>
6
+ %cst_2 = arith.constant dense<true> : tensor<16x1xi1>
7
+ %cst_3 = arith.constant dense<256> : tensor<16x1xi32>
8
+ %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
9
+ %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
10
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
11
+ %c16_i32 = arith.constant 16 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c16_i32 : i32
14
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<16x1xi32>
18
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
20
+ %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
21
+ %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
22
+ %10 = tt.broadcast %5 : (tensor<16x1xi32>) -> tensor<16x128xi32>
23
+ %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<16x128xi32>
24
+ %12 = arith.addi %10, %11 : tensor<16x128xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
26
+ %14 = tt.addptr %13, %12 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
27
+ %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<16x128xi1>
28
+ %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
29
+ %17 = arith.addf %16, %cst_6 : tensor<16x128xf32>
30
+ %18 = arith.select %15, %17, %cst_6 : tensor<16x128xi1>, tensor<16x128xf32>
31
+ %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
32
+ ^bb0(%arg5: f32, %arg6: f32):
33
+ %35 = arith.addf %arg5, %arg6 : f32
34
+ tt.reduce.return %35 : f32
35
+ }) : (tensor<16x128xf32>) -> tensor<16xf32>
36
+ %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
37
+ %21 = arith.divsi %5, %cst_3 : tensor<16x1xi32>
38
+ %22 = arith.remsi %5, %cst_3 : tensor<16x1xi32>
39
+ %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
40
+ %24 = tt.addptr %23, %21 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
41
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
42
+ %26 = arith.addi %25, %cst_1 : tensor<16x1xi64>
43
+ %27 = arith.cmpi slt, %25, %cst_0 : tensor<16x1xi64>
44
+ %28 = arith.select %27, %26, %25 : tensor<16x1xi1>, tensor<16x1xi64>
45
+ %29 = arith.muli %28, %cst : tensor<16x1xi64>
46
+ %30 = arith.extsi %22 : tensor<16x1xi32> to tensor<16x1xi64>
47
+ %31 = arith.addi %30, %29 : tensor<16x1xi64>
48
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x1x!tt.ptr<f32, 1>>
49
+ %33 = tt.addptr %32, %31 : tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xi64>
50
+ %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xf32>, tensor<16x1xi1>) -> tensor<16x1xf32>
51
+ tt.return
52
+ }
53
+ }
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin ADDED
Binary file (13.9 kB). View file
 
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
13
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
19
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
20
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
21
+ )
22
+ .maxntid 64, 1, 1
23
+ {
24
+ .reg .pred %p<33>;
25
+ .reg .b16 %rs<9>;
26
+ .reg .b32 %r<106>;
27
+ .reg .f32 %f<73>;
28
+ .reg .b64 %rd<21>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_0];
34
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_1];
35
+ $L__tmp0:
36
+ .loc 1 26 26
37
+ mov.u32 %r72, %tid.x;
38
+ and.b32 %r73, %r72, 31;
39
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_2];
40
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3];
41
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4];
42
+ shl.b32 %r74, %r72, 2;
43
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5];
44
+ and.b32 %r75, %r74, 252;
45
+ .loc 1 23 28
46
+ mov.u32 %r1, %ctaid.x;
47
+ .loc 1 30 40
48
+ shl.b32 %r76, %r1, 8;
49
+ .loc 1 30 36
50
+ or.b32 %r77, %r76, %r75;
51
+ .loc 1 30 30
52
+ mul.wide.s32 %rd17, %r77, 2;
53
+ add.s64 %rd1, %rd12, %rd17;
54
+ mov.b32 %r4, 0;
55
+ mov.pred %p1, -1;
56
+ .loc 1 30 46
57
+ mov.u32 %r2, 0x0;
58
+ mov.u32 %r3, 0x0;
59
+ @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
60
+ @!%p1 mov.u32 %r2, %r4;
61
+ @!%p1 mov.u32 %r3, %r4;
62
+ cvt.u16.u32 %rs1, %r2;
63
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
64
+ cvt.u16.u32 %rs3, %r3;
65
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
66
+ .loc 1 30 67
67
+ cvt.f32.bf16 %r6, %rs1;
68
+ mov.b32 %f1, %r6;
69
+ cvt.f32.bf16 %r7, %rs2;
70
+ mov.b32 %f2, %r7;
71
+ cvt.f32.bf16 %r8, %rs3;
72
+ mov.b32 %f3, %r8;
73
+ cvt.f32.bf16 %r9, %rs4;
74
+ mov.b32 %f4, %r9;
75
+ .loc 1 31 30
76
+ mul.wide.u32 %rd18, %r75, 4;
77
+ add.s64 %rd2, %rd13, %rd18;
78
+ .loc 1 31 35
79
+ mov.u32 %r10, 0x0;
80
+ mov.u32 %r11, 0x0;
81
+ mov.u32 %r12, 0x0;
82
+ mov.u32 %r13, 0x0;
83
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
84
+ @!%p1 mov.u32 %r10, %r4;
85
+ @!%p1 mov.u32 %r11, %r4;
86
+ @!%p1 mov.u32 %r12, %r4;
87
+ @!%p1 mov.u32 %r13, %r4;
88
+ mov.b32 %f5, %r10;
89
+ mov.b32 %f6, %r11;
90
+ mov.b32 %f7, %r12;
91
+ mov.b32 %f8, %r13;
92
+ .loc 1 32 30
93
+ mul.wide.s32 %rd19, %r77, 4;
94
+ add.s64 %rd3, %rd14, %rd19;
95
+ .loc 1 32 46
96
+ mov.u32 %r18, 0x0;
97
+ mov.u32 %r19, 0x0;
98
+ mov.u32 %r20, 0x0;
99
+ mov.u32 %r21, 0x0;
100
+ @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
101
+ @!%p1 mov.u32 %r18, %r4;
102
+ @!%p1 mov.u32 %r19, %r4;
103
+ @!%p1 mov.u32 %r20, %r4;
104
+ @!%p1 mov.u32 %r21, %r4;
105
+ mov.b32 %f9, %r18;
106
+ mov.b32 %f10, %r19;
107
+ mov.b32 %f11, %r20;
108
+ mov.b32 %f12, %r21;
109
+ .loc 1 33 35
110
+ add.s64 %rd4, %rd11, %rd19;
111
+ .loc 1 33 51
112
+ mov.u32 %r26, 0x0;
113
+ mov.u32 %r27, 0x0;
114
+ mov.u32 %r28, 0x0;
115
+ mov.u32 %r29, 0x0;
116
+ @%p1 ld.global.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
117
+ @!%p1 mov.u32 %r26, %r4;
118
+ @!%p1 mov.u32 %r27, %r4;
119
+ @!%p1 mov.u32 %r28, %r4;
120
+ @!%p1 mov.u32 %r29, %r4;
121
+ mov.b32 %f13, %r26;
122
+ mov.b32 %f14, %r27;
123
+ mov.b32 %f15, %r28;
124
+ mov.b32 %f16, %r29;
125
+ .loc 1 34 31
126
+ mul.wide.s32 %rd20, %r1, 4;
127
+ add.s64 %rd5, %rd15, %rd20;
128
+ .loc 1 34 36
129
+ mov.u32 %r51, 0x0;
130
+ @%p1 ld.global.L1::evict_last.b32 { %r51 }, [ %rd5 + 0 ];
131
+ mov.u32 %r35, 0x0;
132
+ @%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ];
133
+ mov.u32 %r36, 0x0;
134
+ @%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ];
135
+ mov.u32 %r37, 0x0;
136
+ @%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ];
137
+ .loc 1 36 18
138
+ mul.f32 %f17, %f1, %f5;
139
+ mul.f32 %f18, %f2, %f6;
140
+ mul.f32 %f19, %f3, %f7;
141
+ mul.f32 %f20, %f4, %f8;
142
+ $L__tmp1:
143
+ .loc 2 233 15
144
+ fma.rn.f32 %f21, %f1, %f5, %f18;
145
+ fma.rn.f32 %f22, %f3, %f7, %f21;
146
+ fma.rn.f32 %f23, %f4, %f8, %f22;
147
+ $L__tmp2:
148
+ .loc 2 243 36
149
+ mov.b32 %r78, %f23;
150
+ shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
151
+ mov.b32 %f24, %r79;
152
+ $L__tmp3:
153
+ .loc 2 233 15
154
+ add.f32 %f25, %f23, %f24;
155
+ $L__tmp4:
156
+ .loc 2 243 36
157
+ mov.b32 %r80, %f25;
158
+ shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
159
+ mov.b32 %f26, %r81;
160
+ $L__tmp5:
161
+ .loc 2 233 15
162
+ add.f32 %f27, %f25, %f26;
163
+ $L__tmp6:
164
+ .loc 2 243 36
165
+ mov.b32 %r82, %f27;
166
+ shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1;
167
+ mov.b32 %f28, %r83;
168
+ $L__tmp7:
169
+ .loc 2 233 15
170
+ add.f32 %f29, %f27, %f28;
171
+ $L__tmp8:
172
+ .loc 2 243 36
173
+ mov.b32 %r84, %f29;
174
+ shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1;
175
+ mov.b32 %f30, %r85;
176
+ $L__tmp9:
177
+ .loc 2 233 15
178
+ add.f32 %f31, %f29, %f30;
179
+ $L__tmp10:
180
+ .loc 2 243 36
181
+ mov.b32 %r86, %f31;
182
+ shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
183
+ mov.b32 %f32, %r87;
184
+ $L__tmp11:
185
+ .loc 2 233 15
186
+ add.f32 %f33, %f31, %f32;
187
+ $L__tmp12:
188
+ .loc 2 243 36
189
+ setp.eq.s32 %p23, %r73, 0;
190
+ shr.u32 %r88, %r72, 3;
191
+ and.b32 %r89, %r88, 4;
192
+ mov.u32 %r90, global_smem;
193
+ add.s32 %r38, %r90, %r89;
194
+ mov.b32 %r39, %f33;
195
+ @%p23 st.shared.b32 [ %r38 + 0 ], %r39;
196
+ bar.sync 0;
197
+ setp.lt.s32 %p24, %r72, 2;
198
+ add.s32 %r41, %r90, %r74;
199
+ @%p24 ld.shared.b32 %r40, [ %r41 + 0 ];
200
+ mov.b32 %f34, %r40;
201
+ shfl.sync.bfly.b32 %r91, %r40, 1, 31, -1;
202
+ mov.b32 %f35, %r91;
203
+ $L__tmp13:
204
+ .loc 2 233 15
205
+ add.f32 %f36, %f34, %f35;
206
+ $L__tmp14:
207
+ .loc 2 243 36
208
+ and.b32 %r92, %r72, 1;
209
+ setp.eq.b32 %p31, %r92, 1;
210
+ not.pred %p32, %p31;
211
+ and.pred %p25, %p24, %p32;
212
+ mov.b32 %r43, %f36;
213
+ @%p25 st.shared.b32 [ %r41 + 0 ], %r43;
214
+ bar.sync 0;
215
+ ld.shared.f32 %f37, [global_smem];
216
+ $L__tmp15:
217
+ .loc 3 8 15
218
+ add.f32 %f38, %f37, 0f00000000;
219
+ $L__tmp16:
220
+ .loc 1 40 18
221
+ mul.f32 %f39, %f18, %f10;
222
+ $L__tmp17:
223
+ .loc 2 243 36
224
+ bar.sync 0;
225
+ $L__tmp18:
226
+ .loc 2 233 15
227
+ fma.rn.f32 %f40, %f17, %f9, %f39;
228
+ fma.rn.f32 %f41, %f19, %f11, %f40;
229
+ fma.rn.f32 %f42, %f20, %f12, %f41;
230
+ $L__tmp19:
231
+ .loc 2 243 36
232
+ mov.b32 %r93, %f42;
233
+ shfl.sync.bfly.b32 %r94, %r93, 16, 31, -1;
234
+ mov.b32 %f43, %r94;
235
+ $L__tmp20:
236
+ .loc 2 233 15
237
+ add.f32 %f44, %f42, %f43;
238
+ $L__tmp21:
239
+ .loc 2 243 36
240
+ mov.b32 %r95, %f44;
241
+ shfl.sync.bfly.b32 %r96, %r95, 8, 31, -1;
242
+ mov.b32 %f45, %r96;
243
+ $L__tmp22:
244
+ .loc 2 233 15
245
+ add.f32 %f46, %f44, %f45;
246
+ $L__tmp23:
247
+ .loc 2 243 36
248
+ mov.b32 %r97, %f46;
249
+ shfl.sync.bfly.b32 %r98, %r97, 4, 31, -1;
250
+ mov.b32 %f47, %r98;
251
+ $L__tmp24:
252
+ .loc 2 233 15
253
+ add.f32 %f48, %f46, %f47;
254
+ $L__tmp25:
255
+ .loc 2 243 36
256
+ mov.b32 %r99, %f48;
257
+ shfl.sync.bfly.b32 %r100, %r99, 2, 31, -1;
258
+ mov.b32 %f49, %r100;
259
+ $L__tmp26:
260
+ .loc 2 233 15
261
+ add.f32 %f50, %f48, %f49;
262
+ $L__tmp27:
263
+ .loc 2 243 36
264
+ mov.b32 %r101, %f50;
265
+ shfl.sync.bfly.b32 %r102, %r101, 1, 31, -1;
266
+ mov.b32 %f51, %r102;
267
+ $L__tmp28:
268
+ .loc 2 233 15
269
+ add.f32 %f52, %f50, %f51;
270
+ $L__tmp29:
271
+ .loc 2 243 36
272
+ mov.b32 %r45, %f52;
273
+ @%p23 st.shared.b32 [ %r38 + 0 ], %r45;
274
+ bar.sync 0;
275
+ @%p24 ld.shared.b32 %r46, [ %r41 + 0 ];
276
+ mov.b32 %f53, %r46;
277
+ shfl.sync.bfly.b32 %r103, %r46, 1, 31, -1;
278
+ mov.b32 %f54, %r103;
279
+ $L__tmp30:
280
+ .loc 2 233 15
281
+ add.f32 %f55, %f53, %f54;
282
+ $L__tmp31:
283
+ .loc 2 243 36
284
+ mov.b32 %r49, %f55;
285
+ @%p25 st.shared.b32 [ %r41 + 0 ], %r49;
286
+ bar.sync 0;
287
+ ld.shared.f32 %f56, [global_smem];
288
+ $L__tmp32:
289
+ .loc 3 8 15
290
+ add.f32 %f57, %f56, 0f00000000;
291
+ mov.b32 %r52, 1132462080;
292
+ $L__tmp33:
293
+ .loc 1 45 20
294
+ div.full.f32 %r50, %r51, %r52;
295
+ mov.b32 %f58, %r50;
296
+ .loc 1 47 20
297
+ neg.f32 %f59, %f38;
298
+ fma.rn.f32 %f60, %f17, 0f43800000, %f59;
299
+ fma.rn.f32 %f61, %f18, 0f43800000, %f59;
300
+ fma.rn.f32 %f62, %f19, 0f43800000, %f59;
301
+ fma.rn.f32 %f63, %f20, 0f43800000, %f59;
302
+ .loc 1 49 20
303
+ neg.f32 %f64, %f57;
304
+ fma.rn.f32 %f65, %f64, %f9, %f60;
305
+ fma.rn.f32 %f66, %f64, %f10, %f61;
306
+ fma.rn.f32 %f67, %f64, %f11, %f62;
307
+ fma.rn.f32 %f68, %f64, %f12, %f63;
308
+ .loc 1 51 20
309
+ fma.rn.f32 %f69, %f58, %f65, %f13;
310
+ fma.rn.f32 %f70, %f58, %f66, %f14;
311
+ fma.rn.f32 %f71, %f58, %f67, %f15;
312
+ fma.rn.f32 %f72, %f58, %f68, %f16;
313
+ .loc 1 53 51
314
+ mov.b32 %r62, %f69;
315
+ mov.b32 %r63, %f70;
316
+ mov.b32 %r64, %f71;
317
+ mov.b32 %r65, %f72;
318
+ @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r62, %r63, %r64, %r65 };
319
+ .loc 1 54 25
320
+ add.s64 %rd10, %rd16, %rd17;
321
+ .loc 1 54 48
322
+ cvt.rn.bf16.f32 %rs5, %r62;
323
+ cvt.rn.bf16.f32 %rs6, %r63;
324
+ cvt.rn.bf16.f32 %rs7, %r64;
325
+ cvt.rn.bf16.f32 %rs8, %r65;
326
+ mov.b32 %r104, {%rs5, %rs6};
327
+ mov.b32 %r105, {%rs7, %rs8};
328
+ @%p1 st.global.v2.b32 [ %rd10 + 0 ], { %r104, %r105 };
329
+ .loc 1 54 4
330
+ ret;
331
+ $L__tmp34:
332
+ $L__func_end0:
333
+
334
+ }
335
+ .file 1 "/tmp/torchinductor_root/rn/crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py"
336
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
337
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
338
+ .section .debug_abbrev
339
+ {
340
+ .b8 1
341
+ .b8 17
342
+ .b8 1
343
+ .b8 37
344
+ .b8 8
345
+ .b8 19
346
+ .b8 5
347
+ .b8 3
348
+ .b8 8
349
+ .b8 16
350
+ .b8 6
351
+ .b8 27
352
+ .b8 8
353
+ .b8 180
354
+ .b8 66
355
+ .b8 12
356
+ .b8 17
357
+ .b8 1
358
+ .b8 18
359
+ .b8 1
360
+ .b8 0
361
+ .b8 0
362
+ .b8 2
363
+ .b8 46
364
+ .b8 0
365
+ .b8 135
366
+ .b8 64
367
+ .b8 8
368
+ .b8 3
369
+ .b8 8
370
+ .b8 58
371
+ .b8 11
372
+ .b8 59
373
+ .b8 11
374
+ .b8 63
375
+ .b8 12
376
+ .b8 32
377
+ .b8 11
378
+ .b8 0
379
+ .b8 0
380
+ .b8 3
381
+ .b8 46
382
+ .b8 1
383
+ .b8 17
384
+ .b8 1
385
+ .b8 18
386
+ .b8 1
387
+ .b8 64
388
+ .b8 10
389
+ .b8 49
390
+ .b8 19
391
+ .b8 0
392
+ .b8 0
393
+ .b8 4
394
+ .b8 29
395
+ .b8 1
396
+ .b8 49
397
+ .b8 19
398
+ .b8 17
399
+ .b8 1
400
+ .b8 18
401
+ .b8 1
402
+ .b8 88
403
+ .b8 11
404
+ .b8 89
405
+ .b8 11
406
+ .b8 87
407
+ .b8 11
408
+ .b8 0
409
+ .b8 0
410
+ .b8 5
411
+ .b8 29
412
+ .b8 0
413
+ .b8 49
414
+ .b8 19
415
+ .b8 17
416
+ .b8 1
417
+ .b8 18
418
+ .b8 1
419
+ .b8 88
420
+ .b8 11
421
+ .b8 89
422
+ .b8 11
423
+ .b8 87
424
+ .b8 11
425
+ .b8 0
426
+ .b8 0
427
+ .b8 0
428
+ }
429
+ .section .debug_info
430
+ {
431
+ .b32 399
432
+ .b8 2
433
+ .b8 0
434
+ .b32 .debug_abbrev
435
+ .b8 8
436
+ .b8 1
437
+ .b8 116
438
+ .b8 114
439
+ .b8 105
440
+ .b8 116
441
+ .b8 111
442
+ .b8 110
443
+ .b8 0
444
+ .b8 2
445
+ .b8 0
446
+ .b8 99
447
+ .b8 114
448
+ .b8 110
449
+ .b8 121
450
+ .b8 110
451
+ .b8 98
452
+ .b8 109
453
+ .b8 115
454
+ .b8 100
455
+ .b8 50
456
+ .b8 121
457
+ .b8 101
458
+ .b8 108
459
+ .b8 108
460
+ .b8 50
461
+ .b8 108
462
+ .b8 112
463
+ .b8 106
464
+ .b8 121
465
+ .b8 109
466
+ .b8 98
467
+ .b8 52
468
+ .b8 54
469
+ .b8 114
470
+ .b8 116
471
+ .b8 116
472
+ .b8 102
473
+ .b8 97
474
+ .b8 101
475
+ .b8 97
476
+ .b8 50
477
+ .b8 120
478
+ .b8 106
479
+ .b8 119
480
+ .b8 115
481
+ .b8 98
482
+ .b8 120
483
+ .b8 114
484
+ .b8 55
485
+ .b8 53
486
+ .b8 106
487
+ .b8 53
488
+ .b8 52
489
+ .b8 103
490
+ .b8 99
491
+ .b8 116
492
+ .b8 102
493
+ .b8 103
494
+ .b8 105
495
+ .b8 52
496
+ .b8 53
497
+ .b8 55
498
+ .b8 46
499
+ .b8 112
500
+ .b8 121
501
+ .b8 0
502
+ .b32 .debug_line
503
+ .b8 47
504
+ .b8 116
505
+ .b8 109
506
+ .b8 112
507
+ .b8 47
508
+ .b8 116
509
+ .b8 111
510
+ .b8 114
511
+ .b8 99
512
+ .b8 104
513
+ .b8 105
514
+ .b8 110
515
+ .b8 100
516
+ .b8 117
517
+ .b8 99
518
+ .b8 116
519
+ .b8 111
520
+ .b8 114
521
+ .b8 95
522
+ .b8 114
523
+ .b8 111
524
+ .b8 111
525
+ .b8 116
526
+ .b8 47
527
+ .b8 114
528
+ .b8 110
529
+ .b8 0
530
+ .b8 1
531
+ .b64 $L__func_begin0
532
+ .b64 $L__func_end0
533
+ .b8 2
534
+ .b8 116
535
+ .b8 114
536
+ .b8 105
537
+ .b8 116
538
+ .b8 111
539
+ .b8 110
540
+ .b8 95
541
+ .b8 95
542
+ .b8 48
543
+ .b8 100
544
+ .b8 49
545
+ .b8 100
546
+ .b8 50
547
+ .b8 100
548
+ .b8 51
549
+ .b8 100
550
+ .b8 52
551
+ .b8 100
552
+ .b8 53
553
+ .b8 100
554
+ .b8 54
555
+ .b8 100
556
+ .b8 101
557
+ .b8 55
558
+ .b8 100
559
+ .b8 101
560
+ .b8 0
561
+ .b8 116
562
+ .b8 114
563
+ .b8 105
564
+ .b8 116
565
+ .b8 111
566
+ .b8 110
567
+ .b8 95
568
+ .b8 95
569
+ .b8 48
570
+ .b8 100
571
+ .b8 49
572
+ .b8 100
573
+ .b8 50
574
+ .b8 100
575
+ .b8 51
576
+ .b8 100
577
+ .b8 52
578
+ .b8 100
579
+ .b8 53
580
+ .b8 100
581
+ .b8 54
582
+ .b8 100
583
+ .b8 101
584
+ .b8 55
585
+ .b8 100
586
+ .b8 101
587
+ .b8 0
588
+ .b8 1
589
+ .b8 18
590
+ .b8 1
591
+ .b8 1
592
+ .b8 3
593
+ .b64 $L__func_begin0
594
+ .b64 $L__func_end0
595
+ .b8 1
596
+ .b8 156
597
+ .b32 125
598
+ .b8 4
599
+ .b32 125
600
+ .b64 $L__tmp1
601
+ .b64 $L__tmp14
602
+ .b8 2
603
+ .b8 39
604
+ .b8 57
605
+ .b8 5
606
+ .b32 125
607
+ .b64 $L__tmp1
608
+ .b64 $L__tmp14
609
+ .b8 2
610
+ .b8 243
611
+ .b8 36
612
+ .b8 0
613
+ .b8 5
614
+ .b32 125
615
+ .b64 $L__tmp2
616
+ .b64 $L__tmp15
617
+ .b8 2
618
+ .b8 39
619
+ .b8 57
620
+ .b8 5
621
+ .b32 125
622
+ .b64 $L__tmp15
623
+ .b64 $L__tmp16
624
+ .b8 3
625
+ .b8 39
626
+ .b8 44
627
+ .b8 5
628
+ .b32 125
629
+ .b64 $L__tmp17
630
+ .b64 $L__tmp32
631
+ .b8 2
632
+ .b8 43
633
+ .b8 59
634
+ .b8 4
635
+ .b32 125
636
+ .b64 $L__tmp18
637
+ .b64 $L__tmp31
638
+ .b8 2
639
+ .b8 43
640
+ .b8 59
641
+ .b8 5
642
+ .b32 125
643
+ .b64 $L__tmp18
644
+ .b64 $L__tmp31
645
+ .b8 2
646
+ .b8 243
647
+ .b8 36
648
+ .b8 0
649
+ .b8 5
650
+ .b32 125
651
+ .b64 $L__tmp32
652
+ .b64 $L__tmp33
653
+ .b8 3
654
+ .b8 43
655
+ .b8 45
656
+ .b8 0
657
+ .b8 0
658
+ }
659
+ .section .debug_pubnames
660
+ {
661
+ .b32 $L__pubNames_end0-$L__pubNames_start0
662
+ $L__pubNames_start0:
663
+ .b8 2
664
+ .b8 0
665
+ .b32 .debug_info
666
+ .b32 403
667
+ .b32 125
668
+ .b8 116
669
+ .b8 114
670
+ .b8 105
671
+ .b8 116
672
+ .b8 111
673
+ .b8 110
674
+ .b8 95
675
+ .b8 95
676
+ .b8 48
677
+ .b8 100
678
+ .b8 49
679
+ .b8 100
680
+ .b8 50
681
+ .b8 100
682
+ .b8 51
683
+ .b8 100
684
+ .b8 52
685
+ .b8 100
686
+ .b8 53
687
+ .b8 100
688
+ .b8 54
689
+ .b8 100
690
+ .b8 101
691
+ .b8 55
692
+ .b8 100
693
+ .b8 101
694
+ .b8 0
695
+ .b32 0
696
+ $L__pubNames_end0:
697
+ }
698
+ .section .debug_pubtypes
699
+ {
700
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
701
+ $L__pubTypes_start0:
702
+ .b8 2
703
+ .b8 0
704
+ .b32 .debug_info
705
+ .b32 403
706
+ .b32 0
707
+ $L__pubTypes_end0:
708
+ }
709
+ .section .debug_loc { }
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
6
+ %cst_1 = arith.constant 0.000000e+00 : f32
7
+ %c256_i32 = arith.constant 256 : i32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
9
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
20
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
21
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
22
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
27
+ %16 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
28
+ %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
29
+ %18 = tt.load %17, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
30
+ %19 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
31
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
32
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
33
+ %22 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
34
+ %23 = arith.select %2, %22, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
35
+ %24 = "tt.reduce"(%23) <{axis = 0 : i32}> ({
36
+ ^bb0(%arg8: f32, %arg9: f32):
37
+ %43 = arith.addf %arg8, %arg9 : f32
38
+ tt.reduce.return %43 : f32
39
+ }) : (tensor<256xf32, #blocked>) -> f32
40
+ %25 = arith.addf %24, %cst_1 : f32
41
+ %26 = arith.mulf %22, %15 : tensor<256xf32, #blocked>
42
+ %27 = arith.select %2, %26, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
43
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
44
+ ^bb0(%arg8: f32, %arg9: f32):
45
+ %43 = arith.addf %arg8, %arg9 : f32
46
+ tt.reduce.return %43 : f32
47
+ }) : (tensor<256xf32, #blocked>) -> f32
48
+ %29 = arith.addf %28, %cst_1 : f32
49
+ %30 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
50
+ %31 = arith.mulf %22, %cst_3 : tensor<256xf32, #blocked>
51
+ %32 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
52
+ %33 = arith.subf %31, %32 : tensor<256xf32, #blocked>
53
+ %34 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
54
+ %35 = arith.mulf %15, %34 : tensor<256xf32, #blocked>
55
+ %36 = arith.subf %33, %35 : tensor<256xf32, #blocked>
56
+ %37 = tt.broadcast %30 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
57
+ %38 = arith.mulf %37, %36 : tensor<256xf32, #blocked>
58
+ %39 = arith.addf %18, %38 : tensor<256xf32, #blocked>
59
+ tt.store %17, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
60
+ %40 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
61
+ %41 = tt.addptr %40, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
62
+ %42 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
63
+ tt.store %41, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
64
+ tt.return
65
+ }
66
+ }
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin ADDED
Binary file (4.65 kB). View file
 
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
10
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
11
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
12
+ tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
13
+ tt.return
14
+ }
15
+ }
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin ADDED
Binary file (28.6 kB). View file
 
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.398942292> : tensor<1024xf32, #blocked>
5
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
7
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
8
+ %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
9
+ %c1024_i32 = arith.constant 1024 : i32
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = arith.muli %0, %c1024_i32 : i32
12
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
13
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
14
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
15
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
16
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
17
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
18
+ %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
20
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
21
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
22
+ %12 = arith.extf %11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
23
+ %13 = arith.mulf %12, %cst_3 : tensor<1024xf32, #blocked>
24
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
25
+ %15 = arith.addf %14, %cst_2 : tensor<1024xf32, #blocked>
26
+ %16 = arith.mulf %15, %cst_1 : tensor<1024xf32, #blocked>
27
+ %17 = arith.mulf %12, %12 : tensor<1024xf32, #blocked>
28
+ %18 = arith.mulf %17, %cst_0 : tensor<1024xf32, #blocked>
29
+ %19 = math.exp %18 : tensor<1024xf32, #blocked>
30
+ %20 = arith.mulf %19, %cst : tensor<1024xf32, #blocked>
31
+ %21 = arith.mulf %12, %20 : tensor<1024xf32, #blocked>
32
+ %22 = arith.addf %16, %21 : tensor<1024xf32, #blocked>
33
+ %23 = arith.mulf %8, %22 : tensor<1024xf32, #blocked>
34
+ %24 = arith.truncf %23 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
35
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.398942292> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
6
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32>
7
+ %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32>
8
+ %c1024_i32 = arith.constant 1024 : i32
9
+ %0 = tt.get_program_id x : i32
10
+ %1 = arith.muli %0, %c1024_i32 : i32
11
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
12
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
13
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
14
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
15
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
16
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
17
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
18
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
19
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
20
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
21
+ %12 = arith.extf %11 : tensor<1024xbf16> to tensor<1024xf32>
22
+ %13 = arith.mulf %12, %cst_3 : tensor<1024xf32>
23
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
24
+ %15 = arith.addf %14, %cst_2 : tensor<1024xf32>
25
+ %16 = arith.mulf %15, %cst_1 : tensor<1024xf32>
26
+ %17 = arith.mulf %12, %12 : tensor<1024xf32>
27
+ %18 = arith.mulf %17, %cst_0 : tensor<1024xf32>
28
+ %19 = math.exp %18 : tensor<1024xf32>
29
+ %20 = arith.mulf %19, %cst : tensor<1024xf32>
30
+ %21 = arith.mulf %12, %20 : tensor<1024xf32>
31
+ %22 = arith.addf %16, %21 : tensor<1024xf32>
32
+ %23 = arith.mulf %8, %22 : tensor<1024xf32>
33
+ %24 = arith.truncf %23 : tensor<1024xf32> to tensor<1024xbf16>
34
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
35
+ tt.return
36
+ }
37
+ }
.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<12865792> : tensor<1024xi32>
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
9
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
10
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
11
+ %5 = arith.cmpi slt, %4, %cst_0 : tensor<1024xi32>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
13
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
14
+ tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !5 {
7
+ %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %14 = and i32 %13, 31, !dbg !8
9
+ %15 = lshr i32 %13, 5, !dbg !8
10
+ %16 = shl i32 %13, 2, !dbg !8
11
+ %17 = and i32 %16, 60, !dbg !8
12
+ %18 = and i32 %15, 3, !dbg !8
13
+ %19 = lshr i32 %14, 1, !dbg !8
14
+ %20 = shl nuw nsw i32 %18, 4, !dbg !8
15
+ %21 = or i32 %20, %19, !dbg !8
16
+ %22 = and i32 %16, 4, !dbg !9
17
+ %23 = lshr i32 %14, 4, !dbg !9
18
+ %24 = shl nuw nsw i32 %18, 1, !dbg !9
19
+ %25 = or i32 %24, %23, !dbg !9
20
+ %26 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
21
+ %27 = shl i32 %26, 6, !dbg !11
22
+ %28 = or i32 %27, %17, !dbg !12
23
+ %29 = or i32 %27, %21, !dbg !12
24
+ %.frozen = freeze i32 %28
25
+ %30 = sdiv i32 %.frozen, 256, !dbg !13
26
+ %31 = mul i32 %30, 256
27
+ %.decomposed = sub i32 %.frozen, %31
28
+ %32 = sdiv i32 %29, 256, !dbg !13
29
+ %33 = shl i32 %30, 15, !dbg !14
30
+ %34 = shl nsw i32 %32, 7, !dbg !15
31
+ %35 = add i32 %33, %.decomposed
32
+ %36 = mul nuw nsw i32 %17, 12
33
+ %37 = or i32 %25, %36
34
+ %38 = zext nneg i32 %37 to i64
35
+ %39 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %38
36
+ %40 = or i32 %36, 12
37
+ %41 = add nuw nsw i32 %40, %25
38
+ %42 = zext nneg i32 %41 to i64
39
+ %43 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %42
40
+ %44 = add nuw nsw i32 %36, 24
41
+ %45 = or i32 %44, %25
42
+ %46 = zext nneg i32 %45 to i64
43
+ %47 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %46
44
+ %48 = add nuw nsw i32 %36, 36
45
+ %49 = add nuw nsw i32 %48, %25
46
+ %50 = zext nneg i32 %49 to i64
47
+ %51 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %50
48
+ %52 = mul nuw nsw i32 %21, 12
49
+ %53 = add nuw nsw i32 %52, %22
50
+ %54 = zext nneg i32 %53 to i64
51
+ %55 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %54
52
+ %56 = getelementptr float, ptr addrspace(3) @global_smem, i64 %38
53
+ %57 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42
54
+ %58 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46
55
+ %59 = getelementptr float, ptr addrspace(3) @global_smem, i64 %50
56
+ %60 = getelementptr float, ptr addrspace(3) @global_smem, i64 %54
57
+ %61 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 1
58
+ %62 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 2
59
+ %63 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 3
60
+ br label %64, !dbg !16
61
+
62
+ 64: ; preds = %12, %64
63
+ %65 = phi i32 [ 0, %12 ], [ %205, %64 ]
64
+ %66 = phi <8 x float> [ zeroinitializer, %12 ], [ %204, %64 ]
65
+ %67 = or i32 %65, %22, !dbg !17
66
+ %68 = or i32 %65, %25, !dbg !17
67
+ %69 = shl i32 %68, 8, !dbg !18
68
+ %70 = add i32 %35, %69, !dbg !19
69
+ %71 = sext i32 %70 to i64, !dbg !20
70
+ %72 = getelementptr i16, ptr addrspace(1) %0, i64 %71, !dbg !20
71
+ %73 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
72
+ %74 = extractvalue { i32, i32 } %73, 0, !dbg !21
73
+ %75 = extractvalue { i32, i32 } %73, 1, !dbg !21
74
+ %76 = trunc i32 %74 to i16, !dbg !21
75
+ %extelt.offset = lshr i32 %74, 16, !dbg !21
76
+ %77 = trunc i32 %extelt.offset to i16, !dbg !21
77
+ %78 = trunc i32 %75 to i16, !dbg !21
78
+ %extelt.offset1 = lshr i32 %75, 16, !dbg !21
79
+ %79 = trunc i32 %extelt.offset1 to i16, !dbg !21
80
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
81
+ %80 = insertelement <1 x i16> undef, i16 %76, i64 0, !dbg !22
82
+ store <1 x i16> %80, ptr addrspace(3) %39, align 2, !dbg !22
83
+ %81 = insertelement <1 x i16> undef, i16 %77, i64 0, !dbg !22
84
+ store <1 x i16> %81, ptr addrspace(3) %43, align 2, !dbg !22
85
+ %82 = insertelement <1 x i16> undef, i16 %78, i64 0, !dbg !22
86
+ store <1 x i16> %82, ptr addrspace(3) %47, align 2, !dbg !22
87
+ %83 = insertelement <1 x i16> undef, i16 %79, i64 0, !dbg !22
88
+ store <1 x i16> %83, ptr addrspace(3) %51, align 2, !dbg !22
89
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
90
+ %84 = load i16, ptr addrspace(3) %55, align 8, !dbg !22
91
+ %85 = load i16, ptr addrspace(3) %61, align 2, !dbg !22
92
+ %86 = load i16, ptr addrspace(3) %62, align 4, !dbg !22
93
+ %87 = load i16, ptr addrspace(3) %63, align 2, !dbg !22
94
+ %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #3, !dbg !22
95
+ %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #3, !dbg !22
96
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #3, !dbg !22
97
+ %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %87) #3, !dbg !22
98
+ %92 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !23
99
+ %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %92, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
100
+ %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !24
101
+ %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !24
102
+ %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !24
103
+ %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !24
104
+ %98 = bitcast i32 %94 to float, !dbg !24
105
+ %99 = bitcast i32 %95 to float, !dbg !24
106
+ %100 = bitcast i32 %96 to float, !dbg !24
107
+ %101 = bitcast i32 %97 to float, !dbg !24
108
+ tail call void @llvm.nvvm.barrier0(), !dbg !24
109
+ %102 = insertelement <1 x float> undef, float %98, i64 0, !dbg !24
110
+ store <1 x float> %102, ptr addrspace(3) %56, align 4, !dbg !24
111
+ %103 = insertelement <1 x float> undef, float %99, i64 0, !dbg !24
112
+ store <1 x float> %103, ptr addrspace(3) %57, align 4, !dbg !24
113
+ %104 = insertelement <1 x float> undef, float %100, i64 0, !dbg !24
114
+ store <1 x float> %104, ptr addrspace(3) %58, align 4, !dbg !24
115
+ %105 = insertelement <1 x float> undef, float %101, i64 0, !dbg !24
116
+ store <1 x float> %105, ptr addrspace(3) %59, align 4, !dbg !24
117
+ tail call void @llvm.nvvm.barrier0(), !dbg !24
118
+ %106 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !24
119
+ %107 = getelementptr i16, ptr addrspace(1) %2, i64 %71, !dbg !25
120
+ %108 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26
121
+ %109 = extractvalue { i32, i32 } %108, 0, !dbg !26
122
+ %110 = extractvalue { i32, i32 } %108, 1, !dbg !26
123
+ %111 = trunc i32 %109 to i16, !dbg !26
124
+ %extelt.offset2 = lshr i32 %109, 16, !dbg !26
125
+ %112 = trunc i32 %extelt.offset2 to i16, !dbg !26
126
+ %113 = trunc i32 %110 to i16, !dbg !26
127
+ %extelt.offset3 = lshr i32 %110, 16, !dbg !26
128
+ %114 = trunc i32 %extelt.offset3 to i16, !dbg !26
129
+ %115 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #3, !dbg !27
130
+ %116 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #3, !dbg !27
131
+ %117 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #3, !dbg !27
132
+ %118 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #3, !dbg !27
133
+ %119 = add i32 %67, %34, !dbg !28
134
+ %120 = sext i32 %119 to i64, !dbg !29
135
+ %121 = getelementptr float, ptr addrspace(1) %3, i64 %120, !dbg !29
136
+ %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %121, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !30
137
+ %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !30
138
+ %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !30
139
+ %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !30
140
+ %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !30
141
+ %127 = getelementptr float, ptr addrspace(1) %4, i64 %120, !dbg !31
142
+ %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %127, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32
143
+ %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !32
144
+ %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !32
145
+ %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !32
146
+ %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !32
147
+ %133 = getelementptr i16, ptr addrspace(1) %5, i64 %71, !dbg !33
148
+ %134 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !34
149
+ %135 = extractvalue { i32, i32 } %134, 0, !dbg !34
150
+ %136 = extractvalue { i32, i32 } %134, 1, !dbg !34
151
+ %137 = trunc i32 %135 to i16, !dbg !34
152
+ %extelt.offset4 = lshr i32 %135, 16, !dbg !34
153
+ %138 = trunc i32 %extelt.offset4 to i16, !dbg !34
154
+ %139 = trunc i32 %136 to i16, !dbg !34
155
+ %extelt.offset5 = lshr i32 %136, 16, !dbg !34
156
+ %140 = trunc i32 %extelt.offset5 to i16, !dbg !34
157
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
158
+ %141 = insertelement <1 x i16> undef, i16 %137, i64 0, !dbg !35
159
+ store <1 x i16> %141, ptr addrspace(3) %39, align 2, !dbg !35
160
+ %142 = insertelement <1 x i16> undef, i16 %138, i64 0, !dbg !35
161
+ store <1 x i16> %142, ptr addrspace(3) %43, align 2, !dbg !35
162
+ %143 = insertelement <1 x i16> undef, i16 %139, i64 0, !dbg !35
163
+ store <1 x i16> %143, ptr addrspace(3) %47, align 2, !dbg !35
164
+ %144 = insertelement <1 x i16> undef, i16 %140, i64 0, !dbg !35
165
+ store <1 x i16> %144, ptr addrspace(3) %51, align 2, !dbg !35
166
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
167
+ %145 = load i16, ptr addrspace(3) %55, align 8, !dbg !35
168
+ %146 = load i16, ptr addrspace(3) %61, align 2, !dbg !35
169
+ %147 = load i16, ptr addrspace(3) %62, align 4, !dbg !35
170
+ %148 = load i16, ptr addrspace(3) %63, align 2, !dbg !35
171
+ %149 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %145) #3, !dbg !35
172
+ %150 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %146) #3, !dbg !35
173
+ %151 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %147) #3, !dbg !35
174
+ %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %148) #3, !dbg !35
175
+ %153 = getelementptr float, ptr addrspace(1) %6, i64 %120, !dbg !36
176
+ %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %153, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !37
177
+ %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !37
178
+ %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !37
179
+ %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !37
180
+ %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !37
181
+ %159 = getelementptr float, ptr addrspace(1) %7, i64 %120, !dbg !38
182
+ %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !39
183
+ %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !39
184
+ %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !39
185
+ %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !39
186
+ %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !39
187
+ %165 = fadd float %115, %98, !dbg !40
188
+ %166 = fadd float %116, %99, !dbg !40
189
+ %167 = fadd float %117, %100, !dbg !40
190
+ %168 = fadd float %118, %101, !dbg !40
191
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
192
+ %169 = insertelement <1 x float> undef, float %165, i64 0, !dbg !40
193
+ store <1 x float> %169, ptr addrspace(3) %56, align 4, !dbg !40
194
+ %170 = insertelement <1 x float> undef, float %166, i64 0, !dbg !40
195
+ store <1 x float> %170, ptr addrspace(3) %57, align 4, !dbg !40
196
+ %171 = insertelement <1 x float> undef, float %167, i64 0, !dbg !40
197
+ store <1 x float> %171, ptr addrspace(3) %58, align 4, !dbg !40
198
+ %172 = insertelement <1 x float> undef, float %168, i64 0, !dbg !40
199
+ store <1 x float> %172, ptr addrspace(3) %59, align 4, !dbg !40
200
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
201
+ %173 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !40
202
+ %174 = insertelement <8 x i32> poison, i32 %155, i64 0, !dbg !37
203
+ %175 = insertelement <8 x i32> %174, i32 %156, i64 1, !dbg !37
204
+ %176 = insertelement <8 x i32> %175, i32 %157, i64 2, !dbg !37
205
+ %177 = insertelement <8 x i32> %176, i32 %158, i64 3, !dbg !37
206
+ %178 = insertelement <8 x i32> %177, i32 %123, i64 4, !dbg !37
207
+ %179 = insertelement <8 x i32> %178, i32 %124, i64 5, !dbg !37
208
+ %180 = insertelement <8 x i32> %179, i32 %125, i64 6, !dbg !37
209
+ %181 = insertelement <8 x i32> %180, i32 %126, i64 7, !dbg !37
210
+ %182 = bitcast <8 x i32> %181 to <8 x float>, !dbg !37
211
+ %183 = insertelement <8 x i32> poison, i32 %161, i64 0, !dbg !39
212
+ %184 = insertelement <8 x i32> %183, i32 %162, i64 1, !dbg !39
213
+ %185 = insertelement <8 x i32> %184, i32 %163, i64 2, !dbg !39
214
+ %186 = insertelement <8 x i32> %185, i32 %164, i64 3, !dbg !39
215
+ %187 = insertelement <8 x i32> %186, i32 %129, i64 4, !dbg !39
216
+ %188 = insertelement <8 x i32> %187, i32 %130, i64 5, !dbg !39
217
+ %189 = insertelement <8 x i32> %188, i32 %131, i64 6, !dbg !39
218
+ %190 = insertelement <8 x i32> %189, i32 %132, i64 7, !dbg !39
219
+ %191 = bitcast <8 x i32> %190 to <8 x float>, !dbg !39
220
+ %192 = shufflevector <4 x float> %106, <4 x float> %173, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg !41
221
+ %193 = fsub <8 x float> %192, %182, !dbg !41
222
+ %194 = fmul <8 x float> %193, %191, !dbg !42
223
+ %195 = insertelement <8 x float> poison, float %149, i64 0, !dbg !43
224
+ %196 = insertelement <8 x float> %195, float %150, i64 1, !dbg !43
225
+ %197 = insertelement <8 x float> %196, float %151, i64 2, !dbg !43
226
+ %198 = insertelement <8 x float> %197, float %152, i64 3, !dbg !43
227
+ %199 = insertelement <8 x float> %198, float %88, i64 4, !dbg !43
228
+ %200 = insertelement <8 x float> %199, float %89, i64 5, !dbg !43
229
+ %201 = insertelement <8 x float> %200, float %90, i64 6, !dbg !43
230
+ %202 = insertelement <8 x float> %201, float %91, i64 7, !dbg !43
231
+ %203 = fmul <8 x float> %202, %194, !dbg !43
232
+ %204 = fadd <8 x float> %66, %203, !dbg !44
233
+ %205 = add nuw nsw i32 %65, 8, !dbg !16
234
+ %206 = icmp ult i32 %65, 120, !dbg !16
235
+ br i1 %206, label %64, label %207, !dbg !16
236
+
237
+ 207: ; preds = %64
238
+ %208 = and i32 %13, 63, !dbg !8
239
+ %209 = or i32 %27, %208, !dbg !12
240
+ %shift = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>, !dbg !45
241
+ %210 = fadd <8 x float> %204, %shift, !dbg !45
242
+ %shift28 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 poison, i32 poison, i32 poison>, !dbg !45
243
+ %211 = fadd <8 x float> %shift28, %210, !dbg !45
244
+ %shift29 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison>, !dbg !45
245
+ %212 = fadd <8 x float> %shift29, %211, !dbg !45
246
+ %213 = extractelement <8 x float> %212, i64 4, !dbg !45
247
+ %214 = bitcast float %213 to i32, !dbg !51
248
+ %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !51
249
+ %216 = bitcast i32 %215 to float, !dbg !51
250
+ %217 = fadd float %213, %216, !dbg !45
251
+ tail call void @llvm.nvvm.barrier0(), !dbg !53
252
+ %218 = zext nneg i32 %21 to i64, !dbg !53
253
+ %219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !53
254
+ %220 = insertelement <1 x float> undef, float %217, i64 0, !dbg !53
255
+ store <1 x float> %220, ptr addrspace(3) %219, align 4, !dbg !53
256
+ tail call void @llvm.nvvm.barrier0(), !dbg !53
257
+ %221 = zext nneg i32 %208 to i64, !dbg !53
258
+ %222 = getelementptr float, ptr addrspace(3) @global_smem, i64 %221, !dbg !53
259
+ %223 = load i32, ptr addrspace(3) %222, align 4, !dbg !53
260
+ %224 = sext i32 %209 to i64, !dbg !54
261
+ %225 = getelementptr float, ptr addrspace(1) %8, i64 %224, !dbg !54
262
+ %226 = and i32 %13, 64, !dbg !55
263
+ %227 = icmp eq i32 %226, 0, !dbg !55
264
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %223, ptr addrspace(1) %225, i1 %227) #3, !dbg !55
265
+ %shift30 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
266
+ %228 = fadd <8 x float> %204, %shift30, !dbg !56
267
+ %shift31 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
268
+ %229 = fadd <8 x float> %shift31, %228, !dbg !56
269
+ %shift32 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
270
+ %230 = fadd <8 x float> %shift32, %229, !dbg !56
271
+ %231 = extractelement <8 x float> %230, i64 0, !dbg !56
272
+ %232 = bitcast float %231 to i32, !dbg !59
273
+ %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !59
274
+ %234 = bitcast i32 %233 to float, !dbg !59
275
+ %235 = fadd float %231, %234, !dbg !56
276
+ tail call void @llvm.nvvm.barrier0(), !dbg !61
277
+ %236 = insertelement <1 x float> undef, float %235, i64 0, !dbg !61
278
+ store <1 x float> %236, ptr addrspace(3) %219, align 4, !dbg !61
279
+ tail call void @llvm.nvvm.barrier0(), !dbg !61
280
+ %237 = load i32, ptr addrspace(3) %222, align 4, !dbg !61
281
+ %238 = getelementptr float, ptr addrspace(1) %9, i64 %224, !dbg !62
282
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %237, ptr addrspace(1) %238, i1 %227) #3, !dbg !63
283
+ ret void, !dbg !64
284
+ }
285
+
286
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
287
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
288
+
289
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
290
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
291
+
292
+ ; Function Attrs: convergent nocallback nounwind
293
+ declare void @llvm.nvvm.barrier0() #2
294
+
295
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
296
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
297
+ attributes #2 = { convergent nocallback nounwind }
298
+ attributes #3 = { nounwind }
299
+
300
+ !llvm.module.flags = !{!0}
301
+ !llvm.dbg.cu = !{!1}
302
+ !nvvm.annotations = !{!3, !4, !4, !3}
303
+
304
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
305
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
306
+ !2 = !DIFile(filename: "c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py", directory: "/tmp/torchinductor_root/3x")
307
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1}
308
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 128}
309
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
310
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
311
+ !7 = !{}
312
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
313
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
314
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
315
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
316
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
317
+ !13 = !DILocation(line: 26, column: 20, scope: !5)
318
+ !14 = !DILocation(line: 34, column: 57, scope: !5)
319
+ !15 = !DILocation(line: 37, column: 44, scope: !5)
320
+ !16 = !DILocation(line: 30, column: 36, scope: !5)
321
+ !17 = !DILocation(line: 31, column: 27, scope: !5)
322
+ !18 = !DILocation(line: 34, column: 44, scope: !5)
323
+ !19 = !DILocation(line: 34, column: 51, scope: !5)
324
+ !20 = !DILocation(line: 34, column: 34, scope: !5)
325
+ !21 = !DILocation(line: 34, column: 63, scope: !5)
326
+ !22 = !DILocation(line: 34, column: 115, scope: !5)
327
+ !23 = !DILocation(line: 35, column: 34, scope: !5)
328
+ !24 = !DILocation(line: 35, column: 63, scope: !5)
329
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
330
+ !26 = !DILocation(line: 36, column: 63, scope: !5)
331
+ !27 = !DILocation(line: 36, column: 115, scope: !5)
332
+ !28 = !DILocation(line: 37, column: 40, scope: !5)
333
+ !29 = !DILocation(line: 37, column: 34, scope: !5)
334
+ !30 = !DILocation(line: 37, column: 50, scope: !5)
335
+ !31 = !DILocation(line: 38, column: 34, scope: !5)
336
+ !32 = !DILocation(line: 38, column: 50, scope: !5)
337
+ !33 = !DILocation(line: 39, column: 35, scope: !5)
338
+ !34 = !DILocation(line: 39, column: 64, scope: !5)
339
+ !35 = !DILocation(line: 39, column: 116, scope: !5)
340
+ !36 = !DILocation(line: 40, column: 35, scope: !5)
341
+ !37 = !DILocation(line: 40, column: 51, scope: !5)
342
+ !38 = !DILocation(line: 41, column: 35, scope: !5)
343
+ !39 = !DILocation(line: 41, column: 51, scope: !5)
344
+ !40 = !DILocation(line: 44, column: 22, scope: !5)
345
+ !41 = !DILocation(line: 52, column: 23, scope: !5)
346
+ !42 = !DILocation(line: 53, column: 24, scope: !5)
347
+ !43 = !DILocation(line: 54, column: 24, scope: !5)
348
+ !44 = !DILocation(line: 57, column: 40, scope: !5)
349
+ !45 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !49)
350
+ !46 = distinct !DILexicalBlockFile(scope: !48, file: !47, discriminator: 0)
351
+ !47 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
352
+ !48 = distinct !DILexicalBlockFile(scope: !5, file: !47, discriminator: 0)
353
+ !49 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !50)
354
+ !50 = !DILocation(line: 58, column: 27, scope: !46)
355
+ !51 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !52)
356
+ !52 = !DILocation(line: 58, column: 27, scope: !48)
357
+ !53 = !DILocation(line: 58, column: 30, scope: !5)
358
+ !54 = !DILocation(line: 59, column: 25, scope: !5)
359
+ !55 = !DILocation(line: 59, column: 37, scope: !5)
360
+ !56 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !57)
361
+ !57 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !58)
362
+ !58 = !DILocation(line: 60, column: 27, scope: !46)
363
+ !59 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !60)
364
+ !60 = !DILocation(line: 60, column: 27, scope: !48)
365
+ !61 = !DILocation(line: 60, column: 30, scope: !5)
366
+ !62 = !DILocation(line: 61, column: 25, scope: !5)
367
+ !63 = !DILocation(line: 61, column: 37, scope: !5)
368
+ !64 = !DILocation(line: 61, column: 4, scope: !5)
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked1>
8
+ %cst_1 = arith.constant dense<128> : tensor<64x1xi32, #blocked1>
9
+ %cst_2 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
10
+ %cst_3 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
11
+ %cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1>
12
+ %cst_5 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
13
+ %c0_i32 = arith.constant 0 : i32
14
+ %c128_i32 = arith.constant 128 : i32
15
+ %c8_i32 = arith.constant 8 : i32
16
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1>
17
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
18
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
19
+ %c64_i32 = arith.constant 64 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c64_i32 : i32
22
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
23
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
24
+ %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
25
+ %5 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
26
+ %6 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
27
+ %7 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2>
28
+ %8 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
29
+ %9 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
30
+ %10 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2>
31
+ %11 = arith.addi %8, %5 : tensor<64x1xi32, #blocked>
32
+ %12 = arith.addi %9, %6 : tensor<64x1xi32, #blocked1>
33
+ %13 = arith.addi %10, %7 : tensor<64x1xi32, #blocked2>
34
+ %14 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
35
+ %15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
36
+ %16 = tt.expand_dims %14 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1>
37
+ %17 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
38
+ %18 = arith.remsi %11, %cst : tensor<64x1xi32, #blocked>
39
+ %19 = arith.divsi %11, %cst : tensor<64x1xi32, #blocked>
40
+ %20 = arith.divsi %12, %cst_0 : tensor<64x1xi32, #blocked1>
41
+ %21 = tt.broadcast %18 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
42
+ %22 = arith.muli %19, %cst_2 : tensor<64x1xi32, #blocked>
43
+ %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
44
+ %24 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
45
+ %25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
46
+ %26 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
47
+ %27 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked1>
48
+ %28 = tt.broadcast %27 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
49
+ %29 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
50
+ %30 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
51
+ %31 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
52
+ %32 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
53
+ %33 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
54
+ %34:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_6, %arg14 = %cst_6) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 {
55
+ %45 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked1>
56
+ %46 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked>
57
+ %47 = arith.addi %45, %16 : tensor<1x8xi32, #blocked1>
58
+ %48 = arith.addi %46, %17 : tensor<1x8xi32, #blocked>
59
+ %49 = arith.cmpi slt, %47, %cst_4 : tensor<1x8xi32, #blocked1>
60
+ %50 = arith.cmpi slt, %48, %cst_5 : tensor<1x8xi32, #blocked>
61
+ %51 = arith.muli %48, %cst_3 : tensor<1x8xi32, #blocked>
62
+ %52 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
63
+ %53 = arith.addi %21, %52 : tensor<64x8xi32, #blocked>
64
+ %54 = arith.addi %53, %23 : tensor<64x8xi32, #blocked>
65
+ %55 = tt.addptr %24, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
66
+ %56 = tt.broadcast %49 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1>
67
+ %57 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
68
+ %58 = tt.load %55, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
69
+ %59 = triton_gpu.convert_layout %58 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
70
+ %60 = arith.extf %59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
71
+ %61 = tt.addptr %25, %54 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
72
+ %62 = tt.load %61, %57, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
73
+ %63 = triton_gpu.convert_layout %62 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
74
+ %64 = tt.addptr %26, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
75
+ %65 = tt.load %64, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
76
+ %66 = arith.extf %65 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
77
+ %67 = tt.broadcast %47 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
78
+ %68 = arith.addi %67, %28 : tensor<64x8xi32, #blocked1>
79
+ %69 = tt.addptr %29, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
80
+ %70 = tt.load %69, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
81
+ %71 = tt.addptr %30, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
82
+ %72 = tt.load %71, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
83
+ %73 = tt.addptr %31, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
84
+ %74 = tt.load %73, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
85
+ %75 = triton_gpu.convert_layout %74 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
86
+ %76 = arith.extf %75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
87
+ %77 = tt.addptr %32, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
88
+ %78 = tt.load %77, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
89
+ %79 = tt.addptr %33, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
90
+ %80 = tt.load %79, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
91
+ %81 = arith.addf %62, %66 : tensor<64x8xf32, #blocked>
92
+ %82 = triton_gpu.convert_layout %81 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
93
+ %83 = arith.subf %82, %70 : tensor<64x8xf32, #blocked1>
94
+ %84 = arith.mulf %83, %72 : tensor<64x8xf32, #blocked1>
95
+ %85 = arith.mulf %60, %84 : tensor<64x8xf32, #blocked1>
96
+ %86 = arith.addf %arg13, %85 : tensor<64x8xf32, #blocked1>
97
+ %87 = arith.select %56, %86, %arg13 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
98
+ %88 = arith.subf %63, %78 : tensor<64x8xf32, #blocked1>
99
+ %89 = arith.mulf %88, %80 : tensor<64x8xf32, #blocked1>
100
+ %90 = arith.mulf %76, %89 : tensor<64x8xf32, #blocked1>
101
+ %91 = arith.addf %arg14, %90 : tensor<64x8xf32, #blocked1>
102
+ %92 = arith.select %56, %91, %arg14 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
103
+ scf.yield %87, %92 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>
104
+ }
105
+ %35 = "tt.reduce"(%34#0) <{axis = 1 : i32}> ({
106
+ ^bb0(%arg12: f32, %arg13: f32):
107
+ %45 = arith.addf %arg12, %arg13 : f32
108
+ tt.reduce.return %45 : f32
109
+ }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
110
+ %36 = triton_gpu.convert_layout %35 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
111
+ %37 = tt.expand_dims %36 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
112
+ %38 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
113
+ %39 = tt.addptr %38, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
114
+ tt.store %39, %37 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
115
+ %40 = "tt.reduce"(%34#1) <{axis = 1 : i32}> ({
116
+ ^bb0(%arg12: f32, %arg13: f32):
117
+ %45 = arith.addf %arg12, %arg13 : f32
118
+ tt.reduce.return %45 : f32
119
+ }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
120
+ %41 = triton_gpu.convert_layout %40 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
121
+ %42 = tt.expand_dims %41 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
122
+ %43 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
123
+ %44 = tt.addptr %43, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
124
+ tt.store %44, %42 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
125
+ tt.return
126
+ }
127
+ }
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
8
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %10 = and i32 %9, 31, !dbg !10
10
+ %11 = lshr i32 %9, 5, !dbg !10
11
+ %12 = and i32 %11, 1, !dbg !10
12
+ %urem = shl i32 %9, 2, !dbg !10
13
+ %13 = and i32 %urem, 252, !dbg !10
14
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %15 = shl i32 %14, 8, !dbg !12
16
+ %16 = or i32 %15, %13, !dbg !13
17
+ %17 = sext i32 %16 to i64, !dbg !14
18
+ %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !14
19
+ %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
21
+ %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
22
+ %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
23
+ %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
24
+ %24 = bitcast i32 %22 to float, !dbg !15
25
+ %25 = bitcast i32 %23 to float, !dbg !15
26
+ %26 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !16
27
+ %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
28
+ %28 = extractvalue { i32, i32 } %27, 0, !dbg !17
29
+ %29 = extractvalue { i32, i32 } %27, 1, !dbg !17
30
+ %30 = trunc i32 %28 to i16, !dbg !17
31
+ %extelt.offset = lshr i32 %28, 16, !dbg !17
32
+ %31 = trunc i32 %extelt.offset to i16, !dbg !17
33
+ %32 = trunc i32 %29 to i16, !dbg !17
34
+ %extelt.offset1 = lshr i32 %29, 16, !dbg !17
35
+ %33 = trunc i32 %extelt.offset1 to i16, !dbg !17
36
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
37
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
38
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
39
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
40
+ %38 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !19
41
+ %39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %38, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
42
+ %40 = extractvalue { i32, i32 } %39, 0, !dbg !20
43
+ %41 = extractvalue { i32, i32 } %39, 1, !dbg !20
44
+ %42 = trunc i32 %40 to i16, !dbg !20
45
+ %extelt.offset2 = lshr i32 %40, 16, !dbg !20
46
+ %43 = trunc i32 %extelt.offset2 to i16, !dbg !20
47
+ %44 = trunc i32 %41 to i16, !dbg !20
48
+ %extelt.offset3 = lshr i32 %41, 16, !dbg !20
49
+ %45 = trunc i32 %extelt.offset3 to i16, !dbg !20
50
+ %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
51
+ %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
52
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
53
+ %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
54
+ %50 = getelementptr i16, ptr addrspace(1) %3, i64 %17, !dbg !22
55
+ %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
56
+ %52 = extractvalue { i32, i32 } %51, 0, !dbg !23
57
+ %53 = extractvalue { i32, i32 } %51, 1, !dbg !23
58
+ %54 = trunc i32 %52 to i16, !dbg !23
59
+ %extelt.offset4 = lshr i32 %52, 16, !dbg !23
60
+ %55 = trunc i32 %extelt.offset4 to i16, !dbg !23
61
+ %56 = trunc i32 %53 to i16, !dbg !23
62
+ %extelt.offset5 = lshr i32 %53, 16, !dbg !23
63
+ %57 = trunc i32 %extelt.offset5 to i16, !dbg !23
64
+ %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !24
65
+ %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !24
66
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24
67
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24
68
+ %62 = zext nneg i32 %13 to i64, !dbg !25
69
+ %63 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !25
70
+ %64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
71
+ %65 = fadd float %36, %24, !dbg !27
72
+ %66 = fadd float %37, %25, !dbg !27
73
+ %67 = fadd float %65, %48, !dbg !28
74
+ %68 = fadd float %66, %49, !dbg !28
75
+ %69 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !15
76
+ %70 = insertelement <2 x i32> %69, i32 %21, i64 1, !dbg !15
77
+ %71 = bitcast <2 x i32> %70 to <2 x float>, !dbg !15
78
+ %72 = insertelement <2 x float> poison, float %34, i64 0, !dbg !27
79
+ %73 = insertelement <2 x float> %72, float %35, i64 1, !dbg !27
80
+ %74 = fadd <2 x float> %73, %71, !dbg !27
81
+ %75 = insertelement <2 x float> poison, float %46, i64 0, !dbg !28
82
+ %76 = insertelement <2 x float> %75, float %47, i64 1, !dbg !28
83
+ %77 = fadd <2 x float> %74, %76, !dbg !28
84
+ %78 = insertelement <2 x float> poison, float %58, i64 0, !dbg !29
85
+ %79 = insertelement <2 x float> %78, float %59, i64 1, !dbg !29
86
+ %80 = fadd <2 x float> %77, %79, !dbg !29
87
+ %81 = fadd float %67, %60, !dbg !29
88
+ %82 = fadd float %68, %61, !dbg !29
89
+ %83 = extractelement <2 x float> %80, i64 0, !dbg !30
90
+ %84 = extractelement <2 x float> %80, i64 1, !dbg !30
91
+ %85 = fadd float %83, %84, !dbg !30
92
+ %86 = fadd float %85, %81, !dbg !30
93
+ %87 = fadd float %86, %82, !dbg !30
94
+ %88 = bitcast float %87 to i32, !dbg !36
95
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !36
96
+ %90 = bitcast i32 %89 to float, !dbg !36
97
+ %91 = fadd float %87, %90, !dbg !30
98
+ %92 = bitcast float %91 to i32, !dbg !36
99
+ %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 8, i32 31), !dbg !36
100
+ %94 = bitcast i32 %93 to float, !dbg !36
101
+ %95 = fadd float %91, %94, !dbg !30
102
+ %96 = bitcast float %95 to i32, !dbg !36
103
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 4, i32 31), !dbg !36
104
+ %98 = bitcast i32 %97 to float, !dbg !36
105
+ %99 = fadd float %95, %98, !dbg !30
106
+ %100 = bitcast float %99 to i32, !dbg !36
107
+ %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 2, i32 31), !dbg !36
108
+ %102 = bitcast i32 %101 to float, !dbg !36
109
+ %103 = fadd float %99, %102, !dbg !30
110
+ %104 = bitcast float %103 to i32, !dbg !36
111
+ %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !36
112
+ %106 = bitcast i32 %105 to float, !dbg !36
113
+ %107 = fadd float %103, %106, !dbg !30
114
+ %108 = icmp eq i32 %10, 0, !dbg !36
115
+ %109 = zext nneg i32 %12 to i64, !dbg !36
116
+ %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !36
117
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %107, i1 %108) #6, !dbg !36
118
+ tail call void @llvm.nvvm.barrier0(), !dbg !36
119
+ %111 = icmp slt i32 %9, 2, !dbg !36
120
+ %112 = sext i32 %9 to i64, !dbg !36
121
+ %113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !36
122
+ %114 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !36
123
+ %115 = bitcast float %114 to i32, !dbg !36
124
+ %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !36
125
+ %117 = bitcast i32 %116 to float, !dbg !36
126
+ %118 = fadd float %114, %117, !dbg !30
127
+ %119 = and i32 %9, 1, !dbg !36
128
+ %120 = icmp eq i32 %119, 0, !dbg !36
129
+ %121 = and i1 %111, %120, !dbg !36
130
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %118, i1 %121) #6, !dbg !36
131
+ tail call void @llvm.nvvm.barrier0(), !dbg !36
132
+ %122 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36
133
+ %123 = fadd float %122, 0.000000e+00, !dbg !38
134
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #6, !dbg !42
135
+ %125 = fsub float %83, %124, !dbg !43
136
+ %126 = fsub float %84, %124, !dbg !43
137
+ %127 = fsub float %81, %124, !dbg !43
138
+ %128 = fsub float %82, %124, !dbg !43
139
+ %129 = fmul float %125, %125, !dbg !44
140
+ %130 = fmul float %126, %126, !dbg !44
141
+ %131 = fmul float %127, %127, !dbg !44
142
+ %132 = fmul float %128, %128, !dbg !44
143
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
144
+ %133 = fadd float %129, %130, !dbg !47
145
+ %134 = fadd float %131, %133, !dbg !47
146
+ %135 = fadd float %132, %134, !dbg !47
147
+ %136 = bitcast float %135 to i32, !dbg !45
148
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !45
149
+ %138 = bitcast i32 %137 to float, !dbg !45
150
+ %139 = fadd float %135, %138, !dbg !47
151
+ %140 = bitcast float %139 to i32, !dbg !45
152
+ %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !45
153
+ %142 = bitcast i32 %141 to float, !dbg !45
154
+ %143 = fadd float %139, %142, !dbg !47
155
+ %144 = bitcast float %143 to i32, !dbg !45
156
+ %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !45
157
+ %146 = bitcast i32 %145 to float, !dbg !45
158
+ %147 = fadd float %143, %146, !dbg !47
159
+ %148 = bitcast float %147 to i32, !dbg !45
160
+ %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !45
161
+ %150 = bitcast i32 %149 to float, !dbg !45
162
+ %151 = fadd float %147, %150, !dbg !47
163
+ %152 = bitcast float %151 to i32, !dbg !45
164
+ %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !45
165
+ %154 = bitcast i32 %153 to float, !dbg !45
166
+ %155 = fadd float %151, %154, !dbg !47
167
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %155, i1 %108) #6, !dbg !45
168
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
169
+ %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !45
170
+ %157 = bitcast float %156 to i32, !dbg !45
171
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !45
172
+ %159 = bitcast i32 %158 to float, !dbg !45
173
+ %160 = fadd float %156, %159, !dbg !47
174
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %160, i1 %121) #6, !dbg !45
175
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
176
+ %161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
177
+ %162 = fadd float %161, 0.000000e+00, !dbg !50
178
+ %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float 2.560000e+02) #6, !dbg !52
179
+ %164 = fadd float %163, 0x3EE4F8B580000000, !dbg !53
180
+ %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54
181
+ %.not.i = icmp eq i32 %165, 0, !dbg !54
182
+ br i1 %.not.i, label %168, label %166, !dbg !54
183
+
184
+ 166: ; preds = %8
185
+ %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !54
186
+ br label %__nv_rsqrtf.exit, !dbg !54
187
+
188
+ 168: ; preds = %8
189
+ %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !54
190
+ br label %__nv_rsqrtf.exit, !dbg !54
191
+
192
+ __nv_rsqrtf.exit: ; preds = %166, %168
193
+ %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !54
194
+ %170 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !26
195
+ %171 = bitcast i32 %170 to float, !dbg !26
196
+ %172 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !26
197
+ %173 = bitcast i32 %172 to float, !dbg !26
198
+ %174 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !26
199
+ %175 = bitcast i32 %174 to float, !dbg !26
200
+ %176 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !26
201
+ %177 = bitcast i32 %176 to float, !dbg !26
202
+ %178 = fmul float %125, %.0.i, !dbg !55
203
+ %179 = fmul float %126, %.0.i, !dbg !55
204
+ %180 = fmul float %127, %.0.i, !dbg !55
205
+ %181 = fmul float %128, %.0.i, !dbg !55
206
+ %182 = fmul float %178, %177, !dbg !56
207
+ %183 = fmul float %179, %175, !dbg !56
208
+ %184 = fmul float %180, %173, !dbg !56
209
+ %185 = fmul float %181, %171, !dbg !56
210
+ %186 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !57
211
+ %187 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %182) #6, !dbg !58
212
+ %188 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %183) #6, !dbg !58
213
+ %189 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !58
214
+ %190 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !58
215
+ %191 = insertelement <2 x i16> undef, i16 %187, i64 0, !dbg !58
216
+ %192 = insertelement <2 x i16> %191, i16 %188, i64 1, !dbg !58
217
+ %193 = bitcast <2 x i16> %192 to i32, !dbg !58
218
+ %194 = insertelement <2 x i16> undef, i16 %189, i64 0, !dbg !58
219
+ %195 = insertelement <2 x i16> %194, i16 %190, i64 1, !dbg !58
220
+ %196 = bitcast <2 x i16> %195 to i32, !dbg !58
221
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %193, i32 %196, ptr addrspace(1) %186, i1 true) #6, !dbg !58
222
+ ret void, !dbg !59
223
+ }
224
+
225
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
226
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
227
+
228
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
229
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
230
+
231
+ ; Function Attrs: convergent nocallback nounwind
232
+ declare void @llvm.nvvm.barrier0() #2
233
+
234
+ ; Function Attrs: alwaysinline nounwind
235
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
236
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
237
+ %.not = icmp eq i32 %1, 0
238
+ br i1 %.not, label %4, label %2
239
+
240
+ 2: ; preds = %0
241
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
242
+ br label %6
243
+
244
+ 4: ; preds = %0
245
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
246
+ br label %6
247
+
248
+ 6: ; preds = %4, %2
249
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
250
+ ret float %.0
251
+ }
252
+
253
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
254
+
255
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
256
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
257
+
258
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
259
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
260
+
261
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
262
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
263
+ attributes #2 = { convergent nocallback nounwind }
264
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
265
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
266
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
267
+ attributes #6 = { nounwind }
268
+
269
+ !llvm.module.flags = !{!0, !1}
270
+ !llvm.dbg.cu = !{!2}
271
+ !nvvm.annotations = !{!4, !5, !5, !4}
272
+ !llvm.ident = !{!6}
273
+
274
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
275
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
276
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
277
+ !3 = !DIFile(filename: "c4qmi2qsgi5mnuig7w3wx5jmjnmvktjlgcv4c6q7w2vaw3bk6qzb.py", directory: "/tmp/torchinductor_root/4q")
278
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
279
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
280
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
281
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
282
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
283
+ !9 = !{}
284
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
285
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
286
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
287
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
288
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
289
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
290
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
291
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
292
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
293
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
294
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
295
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
296
+ !22 = !DILocation(line: 33, column: 30, scope: !7)
297
+ !23 = !DILocation(line: 33, column: 46, scope: !7)
298
+ !24 = !DILocation(line: 33, column: 67, scope: !7)
299
+ !25 = !DILocation(line: 34, column: 31, scope: !7)
300
+ !26 = !DILocation(line: 34, column: 36, scope: !7)
301
+ !27 = !DILocation(line: 36, column: 18, scope: !7)
302
+ !28 = !DILocation(line: 38, column: 18, scope: !7)
303
+ !29 = !DILocation(line: 40, column: 18, scope: !7)
304
+ !30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34)
305
+ !31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0)
306
+ !32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
307
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
308
+ !34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
309
+ !35 = !DILocation(line: 45, column: 59, scope: !31)
310
+ !36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37)
311
+ !37 = !DILocation(line: 45, column: 59, scope: !33)
312
+ !38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41)
313
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
314
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
315
+ !41 = !DILocation(line: 45, column: 45, scope: !39)
316
+ !42 = !DILocation(line: 48, column: 20, scope: !7)
317
+ !43 = !DILocation(line: 49, column: 20, scope: !7)
318
+ !44 = !DILocation(line: 50, column: 20, scope: !7)
319
+ !45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46)
320
+ !46 = !DILocation(line: 53, column: 59, scope: !33)
321
+ !47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48)
322
+ !48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49)
323
+ !49 = !DILocation(line: 53, column: 59, scope: !31)
324
+ !50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51)
325
+ !51 = !DILocation(line: 53, column: 45, scope: !39)
326
+ !52 = !DILocation(line: 56, column: 20, scope: !7)
327
+ !53 = !DILocation(line: 58, column: 20, scope: !7)
328
+ !54 = !DILocation(line: 59, column: 26, scope: !7)
329
+ !55 = !DILocation(line: 60, column: 20, scope: !7)
330
+ !56 = !DILocation(line: 61, column: 20, scope: !7)
331
+ !57 = !DILocation(line: 63, column: 25, scope: !7)
332
+ !58 = !DILocation(line: 63, column: 48, scope: !7)
333
+ !59 = !DILocation(line: 63, column: 4, scope: !7)
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin ADDED
Binary file (13.1 kB). View file
 
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = shl i32 %6, 2, !dbg !8
11
+ %10 = and i32 %9, 60, !dbg !8
12
+ %11 = and i32 %8, 3, !dbg !9
13
+ %12 = lshr i32 %7, 4, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 1, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
17
+ %16 = shl i32 %15, 6, !dbg !11
18
+ %17 = or i32 %16, %10, !dbg !12
19
+ br label %18, !dbg !13
20
+
21
+ 18: ; preds = %5, %18
22
+ %19 = phi i32 [ 0, %5 ], [ %37, %18 ]
23
+ %20 = phi <4 x float> [ zeroinitializer, %5 ], [ %36, %18 ]
24
+ %21 = or i32 %19, %14, !dbg !14
25
+ %22 = shl i32 %21, 17, !dbg !15
26
+ %23 = add i32 %17, %22, !dbg !16
27
+ %24 = sext i32 %23 to i64, !dbg !17
28
+ %25 = getelementptr float, ptr addrspace(1) %0, i64 %24, !dbg !17
29
+ %26 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
30
+ %27 = extractvalue { i32, i32, i32, i32 } %26, 0, !dbg !18
31
+ %28 = extractvalue { i32, i32, i32, i32 } %26, 1, !dbg !18
32
+ %29 = extractvalue { i32, i32, i32, i32 } %26, 2, !dbg !18
33
+ %30 = extractvalue { i32, i32, i32, i32 } %26, 3, !dbg !18
34
+ %31 = insertelement <4 x i32> poison, i32 %27, i64 0, !dbg !18
35
+ %32 = insertelement <4 x i32> %31, i32 %28, i64 1, !dbg !18
36
+ %33 = insertelement <4 x i32> %32, i32 %29, i64 2, !dbg !18
37
+ %34 = insertelement <4 x i32> %33, i32 %30, i64 3, !dbg !18
38
+ %35 = bitcast <4 x i32> %34 to <4 x float>, !dbg !18
39
+ %36 = fadd <4 x float> %20, %35, !dbg !19
40
+ %37 = add nuw nsw i32 %19, 8, !dbg !13
41
+ %38 = icmp ult i32 %19, 112, !dbg !13
42
+ br i1 %38, label %18, label %39, !dbg !13
43
+
44
+ 39: ; preds = %18
45
+ %40 = and i32 %6, 63, !dbg !8
46
+ %41 = or i32 %16, %40, !dbg !12
47
+ %42 = or i32 %10, 3, !dbg !20
48
+ %43 = or i32 %10, 2, !dbg !20
49
+ %44 = or i32 %10, 1, !dbg !20
50
+ %45 = extractelement <4 x float> %36, i64 0, !dbg !20
51
+ %46 = bitcast float %45 to i32, !dbg !20
52
+ %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !20
53
+ %48 = bitcast i32 %47 to float, !dbg !20
54
+ %49 = fadd float %45, %48, !dbg !24
55
+ %50 = extractelement <4 x float> %36, i64 1, !dbg !20
56
+ %51 = bitcast float %50 to i32, !dbg !20
57
+ %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 16, i32 31), !dbg !20
58
+ %53 = bitcast i32 %52 to float, !dbg !20
59
+ %54 = fadd float %50, %53, !dbg !24
60
+ %55 = extractelement <4 x float> %36, i64 2, !dbg !20
61
+ %56 = bitcast float %55 to i32, !dbg !20
62
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20
63
+ %58 = bitcast i32 %57 to float, !dbg !20
64
+ %59 = fadd float %55, %58, !dbg !24
65
+ %60 = extractelement <4 x float> %36, i64 3, !dbg !20
66
+ %61 = bitcast float %60 to i32, !dbg !20
67
+ %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 16, i32 31), !dbg !20
68
+ %63 = bitcast i32 %62 to float, !dbg !20
69
+ %64 = fadd float %60, %63, !dbg !24
70
+ %65 = icmp ult i32 %7, 16, !dbg !20
71
+ %66 = shl nuw nsw i32 %10, 2, !dbg !20
72
+ %67 = or i32 %66, %11, !dbg !20
73
+ %68 = zext nneg i32 %67 to i64, !dbg !20
74
+ %69 = getelementptr float, ptr addrspace(3) @global_smem, i64 %68, !dbg !20
75
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, float %49, i1 %65) #3, !dbg !20
76
+ %70 = shl nuw nsw i32 %44, 2, !dbg !20
77
+ %71 = or i32 %70, %11, !dbg !20
78
+ %72 = zext nneg i32 %71 to i64, !dbg !20
79
+ %73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !20
80
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %54, i1 %65) #3, !dbg !20
81
+ %74 = shl nuw nsw i32 %43, 2, !dbg !20
82
+ %75 = or i32 %74, %11, !dbg !20
83
+ %76 = zext nneg i32 %75 to i64, !dbg !20
84
+ %77 = getelementptr float, ptr addrspace(3) @global_smem, i64 %76, !dbg !20
85
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, float %59, i1 %65) #3, !dbg !20
86
+ %78 = shl nuw nsw i32 %42, 2, !dbg !20
87
+ %79 = or i32 %78, %11, !dbg !20
88
+ %80 = zext nneg i32 %79 to i64, !dbg !20
89
+ %81 = getelementptr float, ptr addrspace(3) @global_smem, i64 %80, !dbg !20
90
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %81, float %64, i1 %65) #3, !dbg !20
91
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
92
+ %82 = icmp slt i32 %6, 256, !dbg !20
93
+ %83 = sext i32 %6 to i64, !dbg !20
94
+ %84 = getelementptr float, ptr addrspace(3) @global_smem, i64 %83, !dbg !20
95
+ %85 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %84, i1 %82) #3, !dbg !20
96
+ %86 = bitcast float %85 to i32, !dbg !20
97
+ %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 2, i32 31), !dbg !20
98
+ %88 = bitcast i32 %87 to float, !dbg !20
99
+ %89 = fadd float %85, %88, !dbg !24
100
+ %90 = bitcast float %89 to i32, !dbg !20
101
+ %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 1, i32 31), !dbg !20
102
+ %92 = bitcast i32 %91 to float, !dbg !20
103
+ %93 = fadd float %89, %92, !dbg !24
104
+ %94 = and i32 %6, 3, !dbg !20
105
+ %95 = icmp eq i32 %94, 0, !dbg !20
106
+ %96 = and i1 %82, %95, !dbg !20
107
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, float %93, i1 %96) #3, !dbg !20
108
+ %97 = add i32 %6, 128, !dbg !20
109
+ %98 = sext i32 %97 to i64, !dbg !20
110
+ %99 = getelementptr float, ptr addrspace(3) @global_smem, i64 %98, !dbg !20
111
+ %100 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %99, i1 %82) #3, !dbg !20
112
+ %101 = bitcast float %100 to i32, !dbg !20
113
+ %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 2, i32 31), !dbg !20
114
+ %103 = bitcast i32 %102 to float, !dbg !20
115
+ %104 = fadd float %100, %103, !dbg !24
116
+ %105 = bitcast float %104 to i32, !dbg !20
117
+ %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !20
118
+ %107 = bitcast i32 %106 to float, !dbg !20
119
+ %108 = fadd float %104, %107, !dbg !24
120
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %99, float %108, i1 %96) #3, !dbg !20
121
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
122
+ %109 = zext nneg i32 %66 to i64, !dbg !20
123
+ %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !20
124
+ %111 = load float, ptr addrspace(3) %110, align 4, !dbg !20
125
+ %112 = zext nneg i32 %70 to i64, !dbg !20
126
+ %113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !20
127
+ %114 = load float, ptr addrspace(3) %113, align 4, !dbg !20
128
+ %115 = zext nneg i32 %74 to i64, !dbg !20
129
+ %116 = getelementptr float, ptr addrspace(3) @global_smem, i64 %115, !dbg !20
130
+ %117 = load float, ptr addrspace(3) %116, align 4, !dbg !20
131
+ %118 = zext nneg i32 %78 to i64, !dbg !20
132
+ %119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !20
133
+ %120 = load float, ptr addrspace(3) %119, align 4, !dbg !20
134
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
135
+ %121 = zext nneg i32 %10 to i64, !dbg !28
136
+ %122 = getelementptr float, ptr addrspace(3) @global_smem, i64 %121, !dbg !28
137
+ %123 = insertelement <1 x float> undef, float %111, i64 0, !dbg !28
138
+ store <1 x float> %123, ptr addrspace(3) %122, align 4, !dbg !28
139
+ %124 = zext nneg i32 %44 to i64, !dbg !28
140
+ %125 = getelementptr float, ptr addrspace(3) @global_smem, i64 %124, !dbg !28
141
+ %126 = insertelement <1 x float> undef, float %114, i64 0, !dbg !28
142
+ store <1 x float> %126, ptr addrspace(3) %125, align 4, !dbg !28
143
+ %127 = zext nneg i32 %43 to i64, !dbg !28
144
+ %128 = getelementptr float, ptr addrspace(3) @global_smem, i64 %127, !dbg !28
145
+ %129 = insertelement <1 x float> undef, float %117, i64 0, !dbg !28
146
+ store <1 x float> %129, ptr addrspace(3) %128, align 4, !dbg !28
147
+ %130 = zext nneg i32 %42 to i64, !dbg !28
148
+ %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !28
149
+ %132 = insertelement <1 x float> undef, float %120, i64 0, !dbg !28
150
+ store <1 x float> %132, ptr addrspace(3) %131, align 4, !dbg !28
151
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
152
+ %133 = zext nneg i32 %40 to i64, !dbg !28
153
+ %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !28
154
+ %135 = load <1 x float>, ptr addrspace(3) %134, align 4, !dbg !28
155
+ %.frozen = freeze i32 %41
156
+ %136 = sdiv i32 %.frozen, 256, !dbg !29
157
+ %137 = mul i32 %136, 256
158
+ %.decomposed = sub i32 %.frozen, %137
159
+ %138 = sext i32 %136 to i64, !dbg !30
160
+ %139 = getelementptr i64, ptr addrspace(1) %1, i64 %138, !dbg !30
161
+ %140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %139, i1 true) #3, !dbg !31
162
+ %141 = lshr i64 %140, 54, !dbg !32
163
+ %142 = and i64 %141, 512, !dbg !32
164
+ %143 = add i64 %142, %140, !dbg !32
165
+ %144 = shl i64 %143, 8, !dbg !33
166
+ %145 = sext i32 %.decomposed to i64, !dbg !34
167
+ %146 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !35
168
+ %147 = getelementptr float, ptr addrspace(1) %146, i64 %145, !dbg !35
169
+ %148 = and i32 %6, 64, !dbg !36
170
+ %149 = icmp eq i32 %148, 0, !dbg !36
171
+ %150 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %147, <1 x float> %135, i1 %149) #3, !dbg !36
172
+ ret void, !dbg !37
173
+ }
174
+
175
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
176
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
177
+
178
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
179
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
180
+
181
+ ; Function Attrs: convergent nocallback nounwind
182
+ declare void @llvm.nvvm.barrier0() #2
183
+
184
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
185
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
186
+ attributes #2 = { convergent nocallback nounwind }
187
+ attributes #3 = { nounwind }
188
+
189
+ !llvm.module.flags = !{!0}
190
+ !llvm.dbg.cu = !{!1}
191
+ !nvvm.annotations = !{!3, !4, !4, !3}
192
+
193
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
194
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
195
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
196
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
197
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128}
198
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
199
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
200
+ !7 = !{}
201
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
202
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
203
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
204
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
205
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
206
+ !13 = !DILocation(line: 27, column: 36, scope: !5)
207
+ !14 = !DILocation(line: 28, column: 27, scope: !5)
208
+ !15 = !DILocation(line: 31, column: 47, scope: !5)
209
+ !16 = !DILocation(line: 31, column: 40, scope: !5)
210
+ !17 = !DILocation(line: 31, column: 34, scope: !5)
211
+ !18 = !DILocation(line: 31, column: 53, scope: !5)
212
+ !19 = !DILocation(line: 34, column: 38, scope: !5)
213
+ !20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
214
+ !21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
215
+ !22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
216
+ !23 = !DILocation(line: 35, column: 25, scope: !21)
217
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
218
+ !25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
219
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
220
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
221
+ !28 = !DILocation(line: 35, column: 28, scope: !5)
222
+ !29 = !DILocation(line: 36, column: 20, scope: !5)
223
+ !30 = !DILocation(line: 38, column: 30, scope: !5)
224
+ !31 = !DILocation(line: 38, column: 35, scope: !5)
225
+ !32 = !DILocation(line: 41, column: 32, scope: !5)
226
+ !33 = !DILocation(line: 45, column: 40, scope: !5)
227
+ !34 = !DILocation(line: 45, column: 36, scope: !5)
228
+ !35 = !DILocation(line: 45, column: 30, scope: !5)
229
+ !36 = !DILocation(line: 45, column: 55, scope: !5)
230
+ !37 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 128, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<3>;
20
+ .reg .b32 %r<12>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r7, %tid.x;
31
+ shl.b32 %r8, %r7, 1;
32
+ and.b32 %r9, %r8, 254;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r10, %r1, 8;
37
+ .loc 1 21 23
38
+ or.b32 %r11, %r10, %r9;
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd5, %r11, 2;
41
+ add.s64 %rd1, %rd3, %rd5;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 35
44
+ mov.u32 %r2, 0x0;
45
+ @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
46
+ cvt.u16.u32 %rs1, %r2;
47
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
48
+ .loc 1 24 44
49
+ cvt.f32.bf16 %r5, %rs1;
50
+ cvt.f32.bf16 %r6, %rs2;
51
+ .loc 1 26 25
52
+ mul.wide.s32 %rd6, %r11, 4;
53
+ add.s64 %rd2, %rd4, %rd6;
54
+ .loc 1 26 36
55
+ @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 };
56
+ .loc 1 26 4
57
+ ret;
58
+ $L__tmp1:
59
+ $L__func_end0:
60
+
61
+ }
62
+ .file 1 "/tmp/torchinductor_root/zl/czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py"
63
+ .section .debug_abbrev
64
+ {
65
+ .b8 1
66
+ .b8 17
67
+ .b8 1
68
+ .b8 37
69
+ .b8 8
70
+ .b8 19
71
+ .b8 5
72
+ .b8 3
73
+ .b8 8
74
+ .b8 16
75
+ .b8 6
76
+ .b8 27
77
+ .b8 8
78
+ .b8 180
79
+ .b8 66
80
+ .b8 12
81
+ .b8 17
82
+ .b8 1
83
+ .b8 18
84
+ .b8 1
85
+ .b8 0
86
+ .b8 0
87
+ .b8 2
88
+ .b8 46
89
+ .b8 0
90
+ .b8 17
91
+ .b8 1
92
+ .b8 18
93
+ .b8 1
94
+ .b8 64
95
+ .b8 10
96
+ .b8 135
97
+ .b8 64
98
+ .b8 8
99
+ .b8 3
100
+ .b8 8
101
+ .b8 58
102
+ .b8 11
103
+ .b8 59
104
+ .b8 11
105
+ .b8 63
106
+ .b8 12
107
+ .b8 0
108
+ .b8 0
109
+ .b8 0
110
+ }
111
+ .section .debug_info
112
+ {
113
+ .b32 176
114
+ .b8 2
115
+ .b8 0
116
+ .b32 .debug_abbrev
117
+ .b8 8
118
+ .b8 1
119
+ .b8 116
120
+ .b8 114
121
+ .b8 105
122
+ .b8 116
123
+ .b8 111
124
+ .b8 110
125
+ .b8 0
126
+ .b8 2
127
+ .b8 0
128
+ .b8 99
129
+ .b8 122
130
+ .b8 108
131
+ .b8 54
132
+ .b8 110
133
+ .b8 109
134
+ .b8 119
135
+ .b8 97
136
+ .b8 115
137
+ .b8 108
138
+ .b8 55
139
+ .b8 107
140
+ .b8 52
141
+ .b8 105
142
+ .b8 99
143
+ .b8 53
144
+ .b8 53
145
+ .b8 120
146
+ .b8 111
147
+ .b8 119
148
+ .b8 105
149
+ .b8 104
150
+ .b8 99
151
+ .b8 122
152
+ .b8 99
153
+ .b8 111
154
+ .b8 111
155
+ .b8 104
156
+ .b8 51
157
+ .b8 109
158
+ .b8 104
159
+ .b8 117
160
+ .b8 53
161
+ .b8 118
162
+ .b8 54
163
+ .b8 108
164
+ .b8 115
165
+ .b8 54
166
+ .b8 119
167
+ .b8 50
168
+ .b8 120
169
+ .b8 122
170
+ .b8 113
171
+ .b8 113
172
+ .b8 111
173
+ .b8 99
174
+ .b8 100
175
+ .b8 99
176
+ .b8 50
177
+ .b8 100
178
+ .b8 97
179
+ .b8 55
180
+ .b8 46
181
+ .b8 112
182
+ .b8 121
183
+ .b8 0
184
+ .b32 .debug_line
185
+ .b8 47
186
+ .b8 116
187
+ .b8 109
188
+ .b8 112
189
+ .b8 47
190
+ .b8 116
191
+ .b8 111
192
+ .b8 114
193
+ .b8 99
194
+ .b8 104
195
+ .b8 105
196
+ .b8 110
197
+ .b8 100
198
+ .b8 117
199
+ .b8 99
200
+ .b8 116
201
+ .b8 111
202
+ .b8 114
203
+ .b8 95
204
+ .b8 114
205
+ .b8 111
206
+ .b8 111
207
+ .b8 116
208
+ .b8 47
209
+ .b8 122
210
+ .b8 108
211
+ .b8 0
212
+ .b8 1
213
+ .b64 $L__func_begin0
214
+ .b64 $L__func_end0
215
+ .b8 2
216
+ .b64 $L__func_begin0
217
+ .b64 $L__func_end0
218
+ .b8 1
219
+ .b8 156
220
+ .b8 116
221
+ .b8 114
222
+ .b8 105
223
+ .b8 116
224
+ .b8 111
225
+ .b8 110
226
+ .b8 95
227
+ .b8 95
228
+ .b8 48
229
+ .b8 100
230
+ .b8 49
231
+ .b8 100
232
+ .b8 50
233
+ .b8 100
234
+ .b8 101
235
+ .b8 0
236
+ .b8 116
237
+ .b8 114
238
+ .b8 105
239
+ .b8 116
240
+ .b8 111
241
+ .b8 110
242
+ .b8 95
243
+ .b8 95
244
+ .b8 48
245
+ .b8 100
246
+ .b8 49
247
+ .b8 100
248
+ .b8 50
249
+ .b8 100
250
+ .b8 101
251
+ .b8 0
252
+ .b8 1
253
+ .b8 18
254
+ .b8 1
255
+ .b8 0
256
+ }
257
+ .section .debug_pubnames
258
+ {
259
+ .b32 $L__pubNames_end0-$L__pubNames_start0
260
+ $L__pubNames_start0:
261
+ .b8 2
262
+ .b8 0
263
+ .b32 .debug_info
264
+ .b32 180
265
+ .b32 125
266
+ .b8 116
267
+ .b8 114
268
+ .b8 105
269
+ .b8 116
270
+ .b8 111
271
+ .b8 110
272
+ .b8 95
273
+ .b8 95
274
+ .b8 48
275
+ .b8 100
276
+ .b8 49
277
+ .b8 100
278
+ .b8 50
279
+ .b8 100
280
+ .b8 101
281
+ .b8 0
282
+ .b32 0
283
+ $L__pubNames_end0:
284
+ }
285
+ .section .debug_pubtypes
286
+ {
287
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
288
+ $L__pubTypes_start0:
289
+ .b8 2
290
+ .b8 0
291
+ .b32 .debug_info
292
+ .b32 180
293
+ .b32 0
294
+ $L__pubTypes_end0:
295
+ }
296
+ .section .debug_loc { }
.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx ADDED
@@ -0,0 +1,777 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8de9de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
21
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
22
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
23
+ )
24
+ .maxntid 64, 1, 1
25
+ {
26
+ .reg .pred %p<40>;
27
+ .reg .b16 %rs<13>;
28
+ .reg .b32 %r<118>;
29
+ .reg .f32 %f<94>;
30
+ .reg .b64 %rd<28>;
31
+ .loc 1 18 0
32
+ $L__func_begin0:
33
+ .loc 1 18 0
34
+
35
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
36
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
37
+ $L__tmp0:
38
+ .loc 1 26 26
39
+ mov.u32 %r84, %tid.x;
40
+ and.b32 %r85, %r84, 31;
41
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
42
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
43
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
44
+ shl.b32 %r86, %r84, 2;
45
+ ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
46
+ and.b32 %r87, %r86, 252;
47
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
48
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
49
+ .loc 1 23 28
50
+ mov.u32 %r1, %ctaid.x;
51
+ .loc 1 30 40
52
+ shl.b32 %r88, %r1, 8;
53
+ .loc 1 30 36
54
+ or.b32 %r89, %r88, %r87;
55
+ .loc 1 30 30
56
+ mul.wide.s32 %rd24, %r89, 2;
57
+ add.s64 %rd1, %rd17, %rd24;
58
+ mov.b32 %r4, 0;
59
+ mov.pred %p1, -1;
60
+ .loc 1 30 46
61
+ mov.u32 %r2, 0x0;
62
+ mov.u32 %r3, 0x0;
63
+ @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
64
+ @!%p1 mov.u32 %r2, %r4;
65
+ @!%p1 mov.u32 %r3, %r4;
66
+ cvt.u16.u32 %rs1, %r2;
67
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
68
+ cvt.u16.u32 %rs3, %r3;
69
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
70
+ .loc 1 30 67
71
+ cvt.f32.bf16 %r6, %rs1;
72
+ mov.b32 %f1, %r6;
73
+ cvt.f32.bf16 %r7, %rs2;
74
+ mov.b32 %f2, %r7;
75
+ cvt.f32.bf16 %r8, %rs3;
76
+ mov.b32 %f3, %r8;
77
+ cvt.f32.bf16 %r9, %rs4;
78
+ mov.b32 %f4, %r9;
79
+ .loc 1 31 30
80
+ mul.wide.u32 %rd25, %r87, 4;
81
+ add.s64 %rd2, %rd18, %rd25;
82
+ .loc 1 31 35
83
+ mov.u32 %r10, 0x0;
84
+ mov.u32 %r11, 0x0;
85
+ mov.u32 %r12, 0x0;
86
+ mov.u32 %r13, 0x0;
87
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
88
+ @!%p1 mov.u32 %r10, %r4;
89
+ @!%p1 mov.u32 %r11, %r4;
90
+ @!%p1 mov.u32 %r12, %r4;
91
+ @!%p1 mov.u32 %r13, %r4;
92
+ mov.b32 %f5, %r10;
93
+ mov.b32 %f6, %r11;
94
+ mov.b32 %f7, %r12;
95
+ mov.b32 %f8, %r13;
96
+ .loc 1 32 30
97
+ mul.wide.s32 %rd26, %r89, 4;
98
+ add.s64 %rd3, %rd19, %rd26;
99
+ .loc 1 32 46
100
+ mov.u32 %r18, 0x0;
101
+ mov.u32 %r19, 0x0;
102
+ mov.u32 %r20, 0x0;
103
+ mov.u32 %r21, 0x0;
104
+ @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
105
+ @!%p1 mov.u32 %r18, %r4;
106
+ @!%p1 mov.u32 %r19, %r4;
107
+ @!%p1 mov.u32 %r20, %r4;
108
+ @!%p1 mov.u32 %r21, %r4;
109
+ mov.b32 %f9, %r18;
110
+ mov.b32 %f10, %r19;
111
+ mov.b32 %f11, %r20;
112
+ mov.b32 %f12, %r21;
113
+ .loc 1 33 30
114
+ add.s64 %rd4, %rd20, %rd24;
115
+ .loc 1 33 46
116
+ mov.u32 %r26, 0x0;
117
+ mov.u32 %r27, 0x0;
118
+ @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
119
+ @!%p1 mov.u32 %r26, %r4;
120
+ @!%p1 mov.u32 %r27, %r4;
121
+ cvt.u16.u32 %rs5, %r26;
122
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r26; }
123
+ cvt.u16.u32 %rs7, %r27;
124
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
125
+ .loc 1 33 67
126
+ cvt.f32.bf16 %r30, %rs5;
127
+ mov.b32 %f13, %r30;
128
+ cvt.f32.bf16 %r31, %rs6;
129
+ mov.b32 %f14, %r31;
130
+ cvt.f32.bf16 %r32, %rs7;
131
+ mov.b32 %f15, %r32;
132
+ cvt.f32.bf16 %r33, %rs8;
133
+ mov.b32 %f16, %r33;
134
+ .loc 1 34 31
135
+ mul.wide.s32 %rd27, %r1, 4;
136
+ add.s64 %rd5, %rd21, %rd27;
137
+ .loc 1 34 36
138
+ mov.u32 %r34, 0x0;
139
+ @%p1 ld.global.L1::evict_last.b32 { %r34 }, [ %rd5 + 0 ];
140
+ mov.b32 %f17, %r34;
141
+ mov.u32 %r35, 0x0;
142
+ @%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ];
143
+ mov.u32 %r36, 0x0;
144
+ @%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ];
145
+ mov.u32 %r37, 0x0;
146
+ @%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ];
147
+ .loc 1 35 31
148
+ add.s64 %rd9, %rd22, %rd27;
149
+ .loc 1 35 36
150
+ mov.u32 %r63, 0x0;
151
+ @%p1 ld.global.L1::evict_last.b32 { %r63 }, [ %rd9 + 0 ];
152
+ mov.b32 %f18, %r63;
153
+ mov.u32 %r39, 0x0;
154
+ @%p1 ld.global.L1::evict_last.b32 { %r39 }, [ %rd9 + 0 ];
155
+ mov.u32 %r40, 0x0;
156
+ @%p1 ld.global.L1::evict_last.b32 { %r40 }, [ %rd9 + 0 ];
157
+ mov.u32 %r41, 0x0;
158
+ @%p1 ld.global.L1::evict_last.b32 { %r41 }, [ %rd9 + 0 ];
159
+ .loc 1 36 35
160
+ add.s64 %rd13, %rd16, %rd26;
161
+ .loc 1 36 51
162
+ mov.u32 %r42, 0x0;
163
+ mov.u32 %r43, 0x0;
164
+ mov.u32 %r44, 0x0;
165
+ mov.u32 %r45, 0x0;
166
+ @%p1 ld.global.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd13 + 0 ];
167
+ @!%p1 mov.u32 %r42, %r4;
168
+ @!%p1 mov.u32 %r43, %r4;
169
+ @!%p1 mov.u32 %r44, %r4;
170
+ @!%p1 mov.u32 %r45, %r4;
171
+ mov.b32 %f19, %r42;
172
+ mov.b32 %f20, %r43;
173
+ mov.b32 %f21, %r44;
174
+ mov.b32 %f22, %r45;
175
+ .loc 1 38 18
176
+ mul.f32 %f23, %f1, %f5;
177
+ mul.f32 %f24, %f2, %f6;
178
+ mul.f32 %f25, %f3, %f7;
179
+ mul.f32 %f26, %f4, %f8;
180
+ $L__tmp1:
181
+ .loc 2 233 15
182
+ fma.rn.f32 %f27, %f1, %f5, %f24;
183
+ fma.rn.f32 %f28, %f3, %f7, %f27;
184
+ fma.rn.f32 %f29, %f4, %f8, %f28;
185
+ $L__tmp2:
186
+ .loc 2 243 36
187
+ mov.b32 %r90, %f29;
188
+ shfl.sync.bfly.b32 %r91, %r90, 16, 31, -1;
189
+ mov.b32 %f30, %r91;
190
+ $L__tmp3:
191
+ .loc 2 233 15
192
+ add.f32 %f31, %f29, %f30;
193
+ $L__tmp4:
194
+ .loc 2 243 36
195
+ mov.b32 %r92, %f31;
196
+ shfl.sync.bfly.b32 %r93, %r92, 8, 31, -1;
197
+ mov.b32 %f32, %r93;
198
+ $L__tmp5:
199
+ .loc 2 233 15
200
+ add.f32 %f33, %f31, %f32;
201
+ $L__tmp6:
202
+ .loc 2 243 36
203
+ mov.b32 %r94, %f33;
204
+ shfl.sync.bfly.b32 %r95, %r94, 4, 31, -1;
205
+ mov.b32 %f34, %r95;
206
+ $L__tmp7:
207
+ .loc 2 233 15
208
+ add.f32 %f35, %f33, %f34;
209
+ $L__tmp8:
210
+ .loc 2 243 36
211
+ mov.b32 %r96, %f35;
212
+ shfl.sync.bfly.b32 %r97, %r96, 2, 31, -1;
213
+ mov.b32 %f36, %r97;
214
+ $L__tmp9:
215
+ .loc 2 233 15
216
+ add.f32 %f37, %f35, %f36;
217
+ $L__tmp10:
218
+ .loc 2 243 36
219
+ mov.b32 %r98, %f37;
220
+ shfl.sync.bfly.b32 %r99, %r98, 1, 31, -1;
221
+ mov.b32 %f38, %r99;
222
+ $L__tmp11:
223
+ .loc 2 233 15
224
+ add.f32 %f39, %f37, %f38;
225
+ $L__tmp12:
226
+ .loc 2 243 36
227
+ setp.eq.s32 %p30, %r85, 0;
228
+ shr.u32 %r100, %r84, 3;
229
+ and.b32 %r101, %r100, 4;
230
+ mov.u32 %r102, global_smem;
231
+ add.s32 %r50, %r102, %r101;
232
+ mov.b32 %r51, %f39;
233
+ @%p30 st.shared.b32 [ %r50 + 0 ], %r51;
234
+ bar.sync 0;
235
+ setp.lt.s32 %p31, %r84, 2;
236
+ add.s32 %r53, %r102, %r86;
237
+ @%p31 ld.shared.b32 %r52, [ %r53 + 0 ];
238
+ mov.b32 %f40, %r52;
239
+ shfl.sync.bfly.b32 %r103, %r52, 1, 31, -1;
240
+ mov.b32 %f41, %r103;
241
+ $L__tmp13:
242
+ .loc 2 233 15
243
+ add.f32 %f42, %f40, %f41;
244
+ $L__tmp14:
245
+ .loc 2 243 36
246
+ and.b32 %r104, %r84, 1;
247
+ setp.eq.b32 %p38, %r104, 1;
248
+ not.pred %p39, %p38;
249
+ and.pred %p32, %p31, %p39;
250
+ mov.b32 %r55, %f42;
251
+ @%p32 st.shared.b32 [ %r53 + 0 ], %r55;
252
+ bar.sync 0;
253
+ ld.shared.f32 %f43, [global_smem];
254
+ $L__tmp15:
255
+ .loc 3 8 15
256
+ add.f32 %f44, %f43, 0f00000000;
257
+ $L__tmp16:
258
+ .loc 1 43 19
259
+ add.f32 %f45, %f13, %f9;
260
+ add.f32 %f46, %f14, %f10;
261
+ add.f32 %f47, %f15, %f11;
262
+ add.f32 %f48, %f16, %f12;
263
+ .loc 1 44 20
264
+ sub.f32 %f49, %f45, %f17;
265
+ sub.f32 %f50, %f46, %f17;
266
+ sub.f32 %f51, %f47, %f17;
267
+ sub.f32 %f52, %f48, %f17;
268
+ .loc 1 45 20
269
+ mul.f32 %f53, %f49, %f18;
270
+ mul.f32 %f54, %f50, %f18;
271
+ mul.f32 %f55, %f51, %f18;
272
+ mul.f32 %f56, %f52, %f18;
273
+ .loc 1 46 19
274
+ mul.f32 %f57, %f24, %f54;
275
+ $L__tmp17:
276
+ .loc 2 243 36
277
+ bar.sync 0;
278
+ $L__tmp18:
279
+ .loc 2 233 15
280
+ fma.rn.f32 %f58, %f23, %f53, %f57;
281
+ fma.rn.f32 %f59, %f25, %f55, %f58;
282
+ fma.rn.f32 %f60, %f26, %f56, %f59;
283
+ $L__tmp19:
284
+ .loc 2 243 36
285
+ mov.b32 %r105, %f60;
286
+ shfl.sync.bfly.b32 %r106, %r105, 16, 31, -1;
287
+ mov.b32 %f61, %r106;
288
+ $L__tmp20:
289
+ .loc 2 233 15
290
+ add.f32 %f62, %f60, %f61;
291
+ $L__tmp21:
292
+ .loc 2 243 36
293
+ mov.b32 %r107, %f62;
294
+ shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1;
295
+ mov.b32 %f63, %r108;
296
+ $L__tmp22:
297
+ .loc 2 233 15
298
+ add.f32 %f64, %f62, %f63;
299
+ $L__tmp23:
300
+ .loc 2 243 36
301
+ mov.b32 %r109, %f64;
302
+ shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1;
303
+ mov.b32 %f65, %r110;
304
+ $L__tmp24:
305
+ .loc 2 233 15
306
+ add.f32 %f66, %f64, %f65;
307
+ $L__tmp25:
308
+ .loc 2 243 36
309
+ mov.b32 %r111, %f66;
310
+ shfl.sync.bfly.b32 %r112, %r111, 2, 31, -1;
311
+ mov.b32 %f67, %r112;
312
+ $L__tmp26:
313
+ .loc 2 233 15
314
+ add.f32 %f68, %f66, %f67;
315
+ $L__tmp27:
316
+ .loc 2 243 36
317
+ mov.b32 %r113, %f68;
318
+ shfl.sync.bfly.b32 %r114, %r113, 1, 31, -1;
319
+ mov.b32 %f69, %r114;
320
+ $L__tmp28:
321
+ .loc 2 233 15
322
+ add.f32 %f70, %f68, %f69;
323
+ $L__tmp29:
324
+ .loc 2 243 36
325
+ mov.b32 %r57, %f70;
326
+ @%p30 st.shared.b32 [ %r50 + 0 ], %r57;
327
+ bar.sync 0;
328
+ @%p31 ld.shared.b32 %r58, [ %r53 + 0 ];
329
+ mov.b32 %f71, %r58;
330
+ shfl.sync.bfly.b32 %r115, %r58, 1, 31, -1;
331
+ mov.b32 %f72, %r115;
332
+ $L__tmp30:
333
+ .loc 2 233 15
334
+ add.f32 %f73, %f71, %f72;
335
+ $L__tmp31:
336
+ .loc 2 243 36
337
+ mov.b32 %r61, %f73;
338
+ @%p32 st.shared.b32 [ %r53 + 0 ], %r61;
339
+ bar.sync 0;
340
+ ld.shared.f32 %f74, [global_smem];
341
+ $L__tmp32:
342
+ .loc 3 8 15
343
+ add.f32 %f75, %f74, 0f00000000;
344
+ mov.b32 %r64, 1132462080;
345
+ $L__tmp33:
346
+ .loc 1 51 20
347
+ div.full.f32 %r62, %r63, %r64;
348
+ mov.b32 %f76, %r62;
349
+ .loc 1 53 20
350
+ neg.f32 %f77, %f44;
351
+ fma.rn.f32 %f78, %f23, 0f43800000, %f77;
352
+ fma.rn.f32 %f79, %f24, 0f43800000, %f77;
353
+ fma.rn.f32 %f80, %f25, 0f43800000, %f77;
354
+ fma.rn.f32 %f81, %f26, 0f43800000, %f77;
355
+ .loc 1 55 20
356
+ neg.f32 %f82, %f53;
357
+ fma.rn.f32 %f83, %f82, %f75, %f78;
358
+ neg.f32 %f84, %f54;
359
+ fma.rn.f32 %f85, %f84, %f75, %f79;
360
+ neg.f32 %f86, %f55;
361
+ fma.rn.f32 %f87, %f86, %f75, %f80;
362
+ neg.f32 %f88, %f56;
363
+ fma.rn.f32 %f89, %f88, %f75, %f81;
364
+ .loc 1 57 20
365
+ fma.rn.f32 %f90, %f76, %f83, %f19;
366
+ fma.rn.f32 %f91, %f76, %f85, %f20;
367
+ fma.rn.f32 %f92, %f76, %f87, %f21;
368
+ fma.rn.f32 %f93, %f76, %f89, %f22;
369
+ .loc 1 59 51
370
+ mov.b32 %r74, %f90;
371
+ mov.b32 %r75, %f91;
372
+ mov.b32 %r76, %f92;
373
+ mov.b32 %r77, %f93;
374
+ @%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r74, %r75, %r76, %r77 };
375
+ .loc 1 60 25
376
+ add.s64 %rd15, %rd23, %rd24;
377
+ .loc 1 60 48
378
+ cvt.rn.bf16.f32 %rs9, %r74;
379
+ cvt.rn.bf16.f32 %rs10, %r75;
380
+ cvt.rn.bf16.f32 %rs11, %r76;
381
+ cvt.rn.bf16.f32 %rs12, %r77;
382
+ mov.b32 %r116, {%rs9, %rs10};
383
+ mov.b32 %r117, {%rs11, %rs12};
384
+ @%p1 st.global.v2.b32 [ %rd15 + 0 ], { %r116, %r117 };
385
+ .loc 1 60 4
386
+ ret;
387
+ $L__tmp34:
388
+ $L__func_end0:
389
+
390
+ }
391
+ .file 1 "/tmp/torchinductor_root/fh/cfhjzwujbd4bpel57x4hxw7d3m3qqfwrjg6bfe6e4wk2cyh77u45.py"
392
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
393
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
394
+ .section .debug_abbrev
395
+ {
396
+ .b8 1
397
+ .b8 17
398
+ .b8 1
399
+ .b8 37
400
+ .b8 8
401
+ .b8 19
402
+ .b8 5
403
+ .b8 3
404
+ .b8 8
405
+ .b8 16
406
+ .b8 6
407
+ .b8 27
408
+ .b8 8
409
+ .b8 180
410
+ .b8 66
411
+ .b8 12
412
+ .b8 17
413
+ .b8 1
414
+ .b8 18
415
+ .b8 1
416
+ .b8 0
417
+ .b8 0
418
+ .b8 2
419
+ .b8 46
420
+ .b8 0
421
+ .b8 135
422
+ .b8 64
423
+ .b8 8
424
+ .b8 3
425
+ .b8 8
426
+ .b8 58
427
+ .b8 11
428
+ .b8 59
429
+ .b8 11
430
+ .b8 63
431
+ .b8 12
432
+ .b8 32
433
+ .b8 11
434
+ .b8 0
435
+ .b8 0
436
+ .b8 3
437
+ .b8 46
438
+ .b8 1
439
+ .b8 17
440
+ .b8 1
441
+ .b8 18
442
+ .b8 1
443
+ .b8 64
444
+ .b8 10
445
+ .b8 49
446
+ .b8 19
447
+ .b8 0
448
+ .b8 0
449
+ .b8 4
450
+ .b8 29
451
+ .b8 1
452
+ .b8 49
453
+ .b8 19
454
+ .b8 17
455
+ .b8 1
456
+ .b8 18
457
+ .b8 1
458
+ .b8 88
459
+ .b8 11
460
+ .b8 89
461
+ .b8 11
462
+ .b8 87
463
+ .b8 11
464
+ .b8 0
465
+ .b8 0
466
+ .b8 5
467
+ .b8 29
468
+ .b8 0
469
+ .b8 49
470
+ .b8 19
471
+ .b8 17
472
+ .b8 1
473
+ .b8 18
474
+ .b8 1
475
+ .b8 88
476
+ .b8 11
477
+ .b8 89
478
+ .b8 11
479
+ .b8 87
480
+ .b8 11
481
+ .b8 0
482
+ .b8 0
483
+ .b8 0
484
+ }
485
+ .section .debug_info
486
+ {
487
+ .b32 407
488
+ .b8 2
489
+ .b8 0
490
+ .b32 .debug_abbrev
491
+ .b8 8
492
+ .b8 1
493
+ .b8 116
494
+ .b8 114
495
+ .b8 105
496
+ .b8 116
497
+ .b8 111
498
+ .b8 110
499
+ .b8 0
500
+ .b8 2
501
+ .b8 0
502
+ .b8 99
503
+ .b8 102
504
+ .b8 104
505
+ .b8 106
506
+ .b8 122
507
+ .b8 119
508
+ .b8 117
509
+ .b8 106
510
+ .b8 98
511
+ .b8 100
512
+ .b8 52
513
+ .b8 98
514
+ .b8 112
515
+ .b8 101
516
+ .b8 108
517
+ .b8 53
518
+ .b8 55
519
+ .b8 120
520
+ .b8 52
521
+ .b8 104
522
+ .b8 120
523
+ .b8 119
524
+ .b8 55
525
+ .b8 100
526
+ .b8 51
527
+ .b8 109
528
+ .b8 51
529
+ .b8 113
530
+ .b8 113
531
+ .b8 102
532
+ .b8 119
533
+ .b8 114
534
+ .b8 106
535
+ .b8 103
536
+ .b8 54
537
+ .b8 98
538
+ .b8 102
539
+ .b8 101
540
+ .b8 54
541
+ .b8 101
542
+ .b8 52
543
+ .b8 119
544
+ .b8 107
545
+ .b8 50
546
+ .b8 99
547
+ .b8 121
548
+ .b8 104
549
+ .b8 55
550
+ .b8 55
551
+ .b8 117
552
+ .b8 52
553
+ .b8 53
554
+ .b8 46
555
+ .b8 112
556
+ .b8 121
557
+ .b8 0
558
+ .b32 .debug_line
559
+ .b8 47
560
+ .b8 116
561
+ .b8 109
562
+ .b8 112
563
+ .b8 47
564
+ .b8 116
565
+ .b8 111
566
+ .b8 114
567
+ .b8 99
568
+ .b8 104
569
+ .b8 105
570
+ .b8 110
571
+ .b8 100
572
+ .b8 117
573
+ .b8 99
574
+ .b8 116
575
+ .b8 111
576
+ .b8 114
577
+ .b8 95
578
+ .b8 114
579
+ .b8 111
580
+ .b8 111
581
+ .b8 116
582
+ .b8 47
583
+ .b8 102
584
+ .b8 104
585
+ .b8 0
586
+ .b8 1
587
+ .b64 $L__func_begin0
588
+ .b64 $L__func_end0
589
+ .b8 2
590
+ .b8 116
591
+ .b8 114
592
+ .b8 105
593
+ .b8 116
594
+ .b8 111
595
+ .b8 110
596
+ .b8 95
597
+ .b8 95
598
+ .b8 48
599
+ .b8 100
600
+ .b8 49
601
+ .b8 100
602
+ .b8 50
603
+ .b8 100
604
+ .b8 51
605
+ .b8 100
606
+ .b8 52
607
+ .b8 100
608
+ .b8 53
609
+ .b8 100
610
+ .b8 54
611
+ .b8 100
612
+ .b8 55
613
+ .b8 100
614
+ .b8 56
615
+ .b8 100
616
+ .b8 101
617
+ .b8 57
618
+ .b8 100
619
+ .b8 101
620
+ .b8 0
621
+ .b8 116
622
+ .b8 114
623
+ .b8 105
624
+ .b8 116
625
+ .b8 111
626
+ .b8 110
627
+ .b8 95
628
+ .b8 95
629
+ .b8 48
630
+ .b8 100
631
+ .b8 49
632
+ .b8 100
633
+ .b8 50
634
+ .b8 100
635
+ .b8 51
636
+ .b8 100
637
+ .b8 52
638
+ .b8 100
639
+ .b8 53
640
+ .b8 100
641
+ .b8 54
642
+ .b8 100
643
+ .b8 55
644
+ .b8 100
645
+ .b8 56
646
+ .b8 100
647
+ .b8 101
648
+ .b8 57
649
+ .b8 100
650
+ .b8 101
651
+ .b8 0
652
+ .b8 1
653
+ .b8 18
654
+ .b8 1
655
+ .b8 1
656
+ .b8 3
657
+ .b64 $L__func_begin0
658
+ .b64 $L__func_end0
659
+ .b8 1
660
+ .b8 156
661
+ .b32 125
662
+ .b8 4
663
+ .b32 125
664
+ .b64 $L__tmp1
665
+ .b64 $L__tmp14
666
+ .b8 2
667
+ .b8 41
668
+ .b8 57
669
+ .b8 5
670
+ .b32 125
671
+ .b64 $L__tmp1
672
+ .b64 $L__tmp14
673
+ .b8 2
674
+ .b8 243
675
+ .b8 36
676
+ .b8 0
677
+ .b8 5
678
+ .b32 125
679
+ .b64 $L__tmp2
680
+ .b64 $L__tmp15
681
+ .b8 2
682
+ .b8 41
683
+ .b8 57
684
+ .b8 5
685
+ .b32 125
686
+ .b64 $L__tmp15
687
+ .b64 $L__tmp16
688
+ .b8 3
689
+ .b8 41
690
+ .b8 44
691
+ .b8 5
692
+ .b32 125
693
+ .b64 $L__tmp17
694
+ .b64 $L__tmp32
695
+ .b8 2
696
+ .b8 49
697
+ .b8 59
698
+ .b8 4
699
+ .b32 125
700
+ .b64 $L__tmp18
701
+ .b64 $L__tmp31
702
+ .b8 2
703
+ .b8 49
704
+ .b8 59
705
+ .b8 5
706
+ .b32 125
707
+ .b64 $L__tmp18
708
+ .b64 $L__tmp31
709
+ .b8 2
710
+ .b8 243
711
+ .b8 36
712
+ .b8 0
713
+ .b8 5
714
+ .b32 125
715
+ .b64 $L__tmp32
716
+ .b64 $L__tmp33
717
+ .b8 3
718
+ .b8 49
719
+ .b8 45
720
+ .b8 0
721
+ .b8 0
722
+ }
723
+ .section .debug_pubnames
724
+ {
725
+ .b32 $L__pubNames_end0-$L__pubNames_start0
726
+ $L__pubNames_start0:
727
+ .b8 2
728
+ .b8 0
729
+ .b32 .debug_info
730
+ .b32 411
731
+ .b32 125
732
+ .b8 116
733
+ .b8 114
734
+ .b8 105
735
+ .b8 116
736
+ .b8 111
737
+ .b8 110
738
+ .b8 95
739
+ .b8 95
740
+ .b8 48
741
+ .b8 100
742
+ .b8 49
743
+ .b8 100
744
+ .b8 50
745
+ .b8 100
746
+ .b8 51
747
+ .b8 100
748
+ .b8 52
749
+ .b8 100
750
+ .b8 53
751
+ .b8 100
752
+ .b8 54
753
+ .b8 100
754
+ .b8 55
755
+ .b8 100
756
+ .b8 56
757
+ .b8 100
758
+ .b8 101
759
+ .b8 57
760
+ .b8 100
761
+ .b8 101
762
+ .b8 0
763
+ .b32 0
764
+ $L__pubNames_end0:
765
+ }
766
+ .section .debug_pubtypes
767
+ {
768
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
769
+ $L__pubTypes_start0:
770
+ .b8 2
771
+ .b8 0
772
+ .b32 .debug_info
773
+ .b32 411
774
+ .b32 0
775
+ $L__pubTypes_end0:
776
+ }
777
+ .section .debug_loc { }
.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
6
+ %cst_1 = arith.constant 0.000000e+00 : f32
7
+ %c256_i32 = arith.constant 256 : i32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
9
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
20
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
21
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
22
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
27
+ %16 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
28
+ %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
29
+ %18 = tt.load %17, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
30
+ %19 = arith.extf %18 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
31
+ %20 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
32
+ %21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
33
+ %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
34
+ %23 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
35
+ %24 = tt.splat %23 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
36
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
37
+ %26 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
38
+ %27 = tt.addptr %26, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
39
+ %28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
40
+ %29 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
41
+ %30 = arith.select %2, %29, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
42
+ %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
43
+ ^bb0(%arg10: f32, %arg11: f32):
44
+ %55 = arith.addf %arg10, %arg11 : f32
45
+ tt.reduce.return %55 : f32
46
+ }) : (tensor<256xf32, #blocked>) -> f32
47
+ %32 = arith.addf %31, %cst_1 : f32
48
+ %33 = arith.addf %15, %19 : tensor<256xf32, #blocked>
49
+ %34 = tt.broadcast %22 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
50
+ %35 = arith.subf %33, %34 : tensor<256xf32, #blocked>
51
+ %36 = tt.broadcast %25 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
52
+ %37 = arith.mulf %35, %36 : tensor<256xf32, #blocked>
53
+ %38 = arith.mulf %29, %37 : tensor<256xf32, #blocked>
54
+ %39 = arith.select %2, %38, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
55
+ %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
56
+ ^bb0(%arg10: f32, %arg11: f32):
57
+ %55 = arith.addf %arg10, %arg11 : f32
58
+ tt.reduce.return %55 : f32
59
+ }) : (tensor<256xf32, #blocked>) -> f32
60
+ %41 = arith.addf %40, %cst_1 : f32
61
+ %42 = arith.divf %25, %cst_0 : tensor<1xf32, #blocked>
62
+ %43 = arith.mulf %29, %cst_3 : tensor<256xf32, #blocked>
63
+ %44 = tt.splat %32 : (f32) -> tensor<256xf32, #blocked>
64
+ %45 = arith.subf %43, %44 : tensor<256xf32, #blocked>
65
+ %46 = tt.splat %41 : (f32) -> tensor<256xf32, #blocked>
66
+ %47 = arith.mulf %37, %46 : tensor<256xf32, #blocked>
67
+ %48 = arith.subf %45, %47 : tensor<256xf32, #blocked>
68
+ %49 = tt.broadcast %42 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
69
+ %50 = arith.mulf %49, %48 : tensor<256xf32, #blocked>
70
+ %51 = arith.addf %28, %50 : tensor<256xf32, #blocked>
71
+ tt.store %27, %51, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
72
+ %52 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
73
+ %53 = tt.addptr %52, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
74
+ %54 = arith.truncf %51 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
75
+ tt.store %53, %54, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
76
+ tt.return
77
+ }
78
+ }
.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
9
+ %cst_3 = arith.constant 9.99999974E-6 : f32
10
+ %cst_4 = arith.constant 2.560000e+02 : f32
11
+ %cst_5 = arith.constant 0.000000e+00 : f32
12
+ %c256_i32 = arith.constant 256 : i32
13
+ %c512_i32 = arith.constant 512 : i32
14
+ %cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1>
15
+ %cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1>
16
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
17
+ %cst_9 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
18
+ %0 = tt.get_program_id x : i32
19
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
20
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
21
+ %3 = arith.remsi %0, %c512_i32 : i32
22
+ %4 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32
23
+ %5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
24
+ %6 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked1>
25
+ %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
26
+ %8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1>
27
+ %9 = arith.muli %3, %c256_i32 : i32
28
+ %10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked>
29
+ %11 = arith.addi %1, %10 : tensor<256xi32, #blocked>
30
+ %12 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
31
+ %13 = tt.addptr %12, %11 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
32
+ %14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
33
+ %15 = arith.muli %0, %c256_i32 : i32
34
+ %16 = tt.splat %15 : (i32) -> tensor<256xi32, #blocked>
35
+ %17 = arith.addi %1, %16 : tensor<256xi32, #blocked>
36
+ %18 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
37
+ %19 = tt.addptr %18, %17 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
38
+ %20 = tt.load %19, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
39
+ %21 = arith.extf %20 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
40
+ %22 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
41
+ %23 = tt.addptr %22, %17 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
42
+ %24 = tt.load %23, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
43
+ %25 = arith.extf %24 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
44
+ %26 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
45
+ %27 = tt.addptr %26, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
46
+ %28 = tt.load %27, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
47
+ %29 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked>
48
+ %30 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1>
49
+ %31 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked>
50
+ %32 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1>
51
+ %33 = arith.select %31, %29, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
52
+ %34 = arith.select %32, %30, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1>
53
+ %35 = arith.cmpi sge, %34, %cst_7 : tensor<1xi64, #blocked1>
54
+ %36 = arith.cmpi slt, %34, %cst_6 : tensor<1xi64, #blocked1>
55
+ %37 = arith.andi %35, %36 : tensor<1xi1, #blocked1>
56
+ tt.assert %37, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1>
57
+ %38 = arith.muli %33, %cst_2 : tensor<1xi64, #blocked>
58
+ %39 = tt.broadcast %38 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
59
+ %40 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
60
+ %41 = arith.addi %40, %39 : tensor<256xi64, #blocked>
61
+ %42 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
62
+ %43 = tt.addptr %42, %41 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
63
+ %44 = tt.load %43, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
64
+ %45 = arith.addf %44, %14 : tensor<256xf32, #blocked>
65
+ %46 = arith.addf %45, %21 : tensor<256xf32, #blocked>
66
+ %47 = arith.addf %46, %25 : tensor<256xf32, #blocked>
67
+ %48 = arith.select %2, %47, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
68
+ %49 = "tt.reduce"(%48) <{axis = 0 : i32}> ({
69
+ ^bb0(%arg10: f32, %arg11: f32):
70
+ %69 = arith.addf %arg10, %arg11 : f32
71
+ tt.reduce.return %69 : f32
72
+ }) : (tensor<256xf32, #blocked>) -> f32
73
+ %50 = arith.addf %49, %cst_5 : f32
74
+ %51 = arith.divf %50, %cst_4 : f32
75
+ %52 = tt.splat %51 : (f32) -> tensor<256xf32, #blocked>
76
+ %53 = arith.subf %47, %52 : tensor<256xf32, #blocked>
77
+ %54 = arith.mulf %53, %53 : tensor<256xf32, #blocked>
78
+ %55 = arith.select %2, %54, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
79
+ %56 = "tt.reduce"(%55) <{axis = 0 : i32}> ({
80
+ ^bb0(%arg10: f32, %arg11: f32):
81
+ %69 = arith.addf %arg10, %arg11 : f32
82
+ tt.reduce.return %69 : f32
83
+ }) : (tensor<256xf32, #blocked>) -> f32
84
+ %57 = arith.addf %56, %cst_5 : f32
85
+ %58 = arith.divf %57, %cst_4 : f32
86
+ %59 = arith.addf %58, %cst_3 : f32
87
+ %60 = tt.extern_elementwise %59 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
88
+ %61 = tt.splat %60 : (f32) -> tensor<256xf32, #blocked>
89
+ %62 = arith.mulf %53, %61 : tensor<256xf32, #blocked>
90
+ %63 = arith.mulf %62, %28 : tensor<256xf32, #blocked>
91
+ %64 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
92
+ %65 = tt.addptr %64, %17 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
93
+ tt.store %65, %47, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
94
+ %66 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
95
+ %67 = tt.addptr %66, %17 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
96
+ %68 = arith.truncf %63 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
97
+ tt.store %67, %68, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
98
+ tt.return
99
+ }
100
+ }
.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
5
+ %c512_i32 = arith.constant 512 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c512_i32 : i32
8
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
11
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
12
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
13
+ tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
14
+ tt.return
15
+ }
16
+ }