diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..f4de6b2a61f15fddbd170b22fadd9cc7a4e9e800 --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir @@ -0,0 +1,366 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %12 = and i32 %11, 31, !dbg !10 + %13 = lshr i32 %11, 5, !dbg !10 + %14 = and i32 %13, 1, !dbg !10 + %urem = shl i32 %11, 2, !dbg !10 + %15 = and i32 %urem, 252, !dbg !10 + %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %17 = shl i32 %16, 8, !dbg !12 + %18 = or i32 %17, %15, !dbg !13 + %19 = sext i32 %18 to i64, !dbg !14 + %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14 + %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15 + %23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15 + %24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15 + %25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15 + %26 = bitcast i32 %22 to float, !dbg !15 + %27 = bitcast i32 %23 to float, !dbg !15 + %28 = bitcast i32 %24 to float, !dbg !15 + %29 = bitcast i32 %25 to float, !dbg !15 + %30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !17 + %33 = extractvalue { i32, i32 } %31, 1, !dbg !17 + %34 = trunc i32 %32 to i16, !dbg !17 + %extelt.offset = lshr i32 %32, 16, !dbg !17 + %35 = trunc i32 %extelt.offset to i16, !dbg !17 + %36 = trunc i32 %33 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %33, 16, !dbg !17 + %37 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18 + %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18 + %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18 + %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18 + %42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19 + %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %44 = extractvalue { i32, i32 } %43, 0, !dbg !20 + %45 = extractvalue { i32, i32 } %43, 1, !dbg !20 + %46 = trunc i32 %44 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %44, 16, !dbg !20 + %47 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %48 = trunc i32 %45 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %45, 16, !dbg !20 + %49 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21 + %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21 + %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21 + %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21 + %54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22 + %55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 + %56 = extractvalue { i32, i32 } %55, 0, !dbg !23 + %57 = extractvalue { i32, i32 } %55, 1, !dbg !23 + %58 = trunc i32 %56 to i16, !dbg !23 + %extelt.offset4 = lshr i32 %56, 16, !dbg !23 + %59 = trunc i32 %extelt.offset4 to i16, !dbg !23 + %60 = trunc i32 %57 to i16, !dbg !23 + %extelt.offset5 = lshr i32 %57, 16, !dbg !23 + %61 = trunc i32 %extelt.offset5 to i16, !dbg !23 + %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24 + %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24 + %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24 + %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24 + %66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25 + %67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26 + %68 = extractvalue { i32, i32 } %67, 0, !dbg !26 + %69 = extractvalue { i32, i32 } %67, 1, !dbg !26 + %70 = trunc i32 %68 to i16, !dbg !26 + %extelt.offset6 = lshr i32 %68, 16, !dbg !26 + %71 = trunc i32 %extelt.offset6 to i16, !dbg !26 + %72 = trunc i32 %69 to i16, !dbg !26 + %extelt.offset7 = lshr i32 %69, 16, !dbg !26 + %73 = trunc i32 %extelt.offset7 to i16, !dbg !26 + %74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27 + %75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27 + %76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27 + %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27 + %78 = zext nneg i32 %15 to i64, !dbg !28 + %79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28 + %80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %81 = fadd float %38, %26, !dbg !30 + %82 = fadd float %39, %27, !dbg !30 + %83 = fadd float %40, %28, !dbg !30 + %84 = fadd float %81, %50, !dbg !31 + %85 = fadd float %82, %51, !dbg !31 + %86 = fadd float %83, %52, !dbg !31 + %87 = fadd float %85, %63, !dbg !32 + %88 = fadd float %86, %64, !dbg !32 + %89 = fadd float %87, %75, !dbg !33 + %90 = fadd float %88, %76, !dbg !33 + %91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32 + %92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32 + %93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32 + %94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32 + %95 = fadd <2 x float> %92, %94, !dbg !32 + %96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33 + %97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33 + %98 = fadd <2 x float> %95, %97, !dbg !33 + %99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34 + %100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34 + %101 = fadd <2 x float> %98, %100, !dbg !34 + %102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34 + %103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34 + %104 = fadd <2 x float> %101, %103, !dbg !34 + %105 = extractelement <2 x float> %104, i64 0, !dbg !34 + %106 = extractelement <2 x float> %104, i64 1, !dbg !34 + %107 = fadd float %105, %106, !dbg !34 + %108 = bitcast float %107 to i32, !dbg !40 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40 + %110 = bitcast i32 %109 to float, !dbg !40 + %111 = fadd float %107, %110, !dbg !34 + %112 = bitcast float %111 to i32, !dbg !40 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40 + %114 = bitcast i32 %113 to float, !dbg !40 + %115 = fadd float %111, %114, !dbg !34 + %116 = bitcast float %115 to i32, !dbg !40 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40 + %118 = bitcast i32 %117 to float, !dbg !40 + %119 = fadd float %115, %118, !dbg !34 + %120 = bitcast float %119 to i32, !dbg !40 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40 + %122 = bitcast i32 %121 to float, !dbg !40 + %123 = fadd float %119, %122, !dbg !34 + %124 = bitcast float %123 to i32, !dbg !40 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40 + %126 = bitcast i32 %125 to float, !dbg !40 + %127 = fadd float %123, %126, !dbg !34 + %128 = icmp eq i32 %12, 0, !dbg !40 + %129 = zext nneg i32 %14 to i64, !dbg !40 + %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %131 = icmp slt i32 %11, 2, !dbg !40 + %132 = sext i32 %11 to i64, !dbg !40 + %133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40 + %134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40 + %135 = bitcast float %134 to i32, !dbg !40 + %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40 + %137 = bitcast i32 %136 to float, !dbg !40 + %138 = fadd float %134, %137, !dbg !34 + %139 = and i32 %11, 1, !dbg !40 + %140 = icmp eq i32 %139, 0, !dbg !40 + %141 = and i1 %131, %140, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40 + %143 = fadd float %142, 0.000000e+00, !dbg !42 + %144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46 + %145 = extractelement <2 x float> %98, i64 0, !dbg !47 + %146 = fsub float %145, %144, !dbg !47 + %147 = fsub float %89, %144, !dbg !47 + %148 = fsub float %90, %144, !dbg !47 + %149 = fsub float %106, %144, !dbg !47 + %150 = fmul float %146, %146, !dbg !48 + %151 = fmul float %147, %147, !dbg !48 + %152 = fmul float %148, %148, !dbg !48 + %153 = fmul float %149, %149, !dbg !48 + tail call void @llvm.nvvm.barrier0(), !dbg !49 + %154 = fadd float %150, %151, !dbg !51 + %155 = fadd float %152, %154, !dbg !51 + %156 = fadd float %153, %155, !dbg !51 + %157 = bitcast float %156 to i32, !dbg !49 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49 + %159 = bitcast i32 %158 to float, !dbg !49 + %160 = fadd float %156, %159, !dbg !51 + %161 = bitcast float %160 to i32, !dbg !49 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49 + %163 = bitcast i32 %162 to float, !dbg !49 + %164 = fadd float %160, %163, !dbg !51 + %165 = bitcast float %164 to i32, !dbg !49 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49 + %167 = bitcast i32 %166 to float, !dbg !49 + %168 = fadd float %164, %167, !dbg !51 + %169 = bitcast float %168 to i32, !dbg !49 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49 + %171 = bitcast i32 %170 to float, !dbg !49 + %172 = fadd float %168, %171, !dbg !51 + %173 = bitcast float %172 to i32, !dbg !49 + %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49 + %175 = bitcast i32 %174 to float, !dbg !49 + %176 = fadd float %172, %175, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49 + tail call void @llvm.nvvm.barrier0(), !dbg !49 + %177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49 + %178 = bitcast float %177 to i32, !dbg !49 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49 + %180 = bitcast i32 %179 to float, !dbg !49 + %181 = fadd float %177, %180, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49 + tail call void @llvm.nvvm.barrier0(), !dbg !49 + %182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49 + %183 = fadd float %182, 0.000000e+00, !dbg !54 + %184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56 + %185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57 + %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i = icmp eq i32 %186, 0, !dbg !58 + br i1 %.not.i, label %189, label %187, !dbg !58 + +187: ; preds = %10 + %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58 + br label %__nv_rsqrtf.exit, !dbg !58 + +189: ; preds = %10 + %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58 + br label %__nv_rsqrtf.exit, !dbg !58 + +__nv_rsqrtf.exit: ; preds = %187, %189 + %.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58 + %191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29 + %192 = bitcast i32 %191 to float, !dbg !29 + %193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29 + %194 = bitcast i32 %193 to float, !dbg !29 + %195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29 + %196 = bitcast i32 %195 to float, !dbg !29 + %197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29 + %198 = bitcast i32 %197 to float, !dbg !29 + %199 = fmul float %146, %.0.i, !dbg !59 + %200 = fmul float %147, %.0.i, !dbg !59 + %201 = fmul float %148, %.0.i, !dbg !59 + %202 = fmul float %149, %.0.i, !dbg !59 + %203 = fmul float %199, %198, !dbg !60 + %204 = fmul float %200, %196, !dbg !60 + %205 = fmul float %201, %194, !dbg !60 + %206 = fmul float %202, %192, !dbg !60 + %207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61 + %208 = bitcast float %145 to i32, !dbg !62 + %209 = bitcast float %89 to i32, !dbg !62 + %210 = bitcast float %90 to i32, !dbg !62 + %211 = bitcast float %106 to i32, !dbg !62 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62 + %212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63 + %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64 + %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64 + %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64 + %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64 + %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64 + %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64 + %219 = bitcast <2 x i16> %218 to i32, !dbg !64 + %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64 + %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64 + %222 = bitcast <2 x i16> %221 to i32, !dbg !64 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cybxnh26qvsbmxmvdr54vaav2ezk2qxu7562fhhsn4lvyvqgoglw.py", directory: "/tmp/torchinductor_root/yb") +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 30, scope: !7) +!20 = !DILocation(line: 32, column: 46, scope: !7) +!21 = !DILocation(line: 32, column: 67, scope: !7) +!22 = !DILocation(line: 33, column: 30, scope: !7) +!23 = !DILocation(line: 33, column: 46, scope: !7) +!24 = !DILocation(line: 33, column: 67, scope: !7) +!25 = !DILocation(line: 34, column: 31, scope: !7) +!26 = !DILocation(line: 34, column: 47, scope: !7) +!27 = !DILocation(line: 34, column: 68, scope: !7) +!28 = !DILocation(line: 35, column: 31, scope: !7) +!29 = !DILocation(line: 35, column: 36, scope: !7) +!30 = !DILocation(line: 37, column: 18, scope: !7) +!31 = !DILocation(line: 39, column: 18, scope: !7) +!32 = !DILocation(line: 41, column: 18, scope: !7) +!33 = !DILocation(line: 43, column: 19, scope: !7) +!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38) +!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0) +!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39) +!39 = !DILocation(line: 48, column: 59, scope: !35) +!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41) +!41 = !DILocation(line: 48, column: 59, scope: !37) +!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45) +!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0) +!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!45 = !DILocation(line: 48, column: 45, scope: !43) +!46 = !DILocation(line: 51, column: 20, scope: !7) +!47 = !DILocation(line: 52, column: 20, scope: !7) +!48 = !DILocation(line: 53, column: 20, scope: !7) +!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50) +!50 = !DILocation(line: 56, column: 59, scope: !37) +!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52) +!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53) +!53 = !DILocation(line: 56, column: 59, scope: !35) +!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55) +!55 = !DILocation(line: 56, column: 45, scope: !43) +!56 = !DILocation(line: 59, column: 20, scope: !7) +!57 = !DILocation(line: 61, column: 20, scope: !7) +!58 = !DILocation(line: 62, column: 26, scope: !7) +!59 = !DILocation(line: 63, column: 20, scope: !7) +!60 = !DILocation(line: 64, column: 20, scope: !7) +!61 = !DILocation(line: 66, column: 25, scope: !7) +!62 = !DILocation(line: 66, column: 48, scope: !7) +!63 = !DILocation(line: 67, column: 25, scope: !7) +!64 = !DILocation(line: 67, column: 48, scope: !7) +!65 = !DILocation(line: 67, column: 4, scope: !7) diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..623158aaf6ad1fd6d7e07515ab88800d0975f7e6 --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx @@ -0,0 +1,807 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8de9de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6d7d8de9de( + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<33>; + .reg .b16 %rs<21>; + .reg .b32 %r<112>; + .reg .f32 %f<94>; + .reg .b64 %rd<20>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0]; + ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r78, %tid.x; + and.b32 %r79, %r78, 31; + ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3]; + ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4]; + shl.b32 %r80, %r78, 2; + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5]; + and.b32 %r81, %r80, 252; + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r82, %r1, 8; + .loc 1 30 36 + or.b32 %r83, %r82, %r81; + .loc 1 30 30 + mul.wide.s32 %rd17, %r83, 4; + add.s64 %rd1, %rd9, %rd17; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + .loc 1 31 30 + mul.wide.s32 %rd18, %r83, 2; + add.s64 %rd2, %rd10, %rd18; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd11, %rd18; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f9, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f10, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f11, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f12, %r25; + .loc 1 33 30 + add.s64 %rd4, %rd12, %rd18; + .loc 1 33 46 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + cvt.u16.u32 %rs9, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; } + cvt.u16.u32 %rs11, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; } + .loc 1 33 67 + cvt.f32.bf16 %r30, %rs9; + mov.b32 %f13, %r30; + cvt.f32.bf16 %r31, %rs10; + mov.b32 %f14, %r31; + cvt.f32.bf16 %r32, %rs11; + mov.b32 %f15, %r32; + cvt.f32.bf16 %r33, %rs12; + mov.b32 %f16, %r33; + .loc 1 34 31 + add.s64 %rd5, %rd13, %rd18; + .loc 1 34 47 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + cvt.u16.u32 %rs13, %r34; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; } + cvt.u16.u32 %rs15, %r35; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; } + .loc 1 34 68 + cvt.f32.bf16 %r38, %rs13; + mov.b32 %f17, %r38; + cvt.f32.bf16 %r39, %rs14; + mov.b32 %f18, %r39; + cvt.f32.bf16 %r40, %rs15; + mov.b32 %f19, %r40; + cvt.f32.bf16 %r41, %rs16; + mov.b32 %f20, %r41; + .loc 1 35 31 + mul.wide.u32 %rd19, %r81, 4; + add.s64 %rd6, %rd14, %rd19; + .loc 1 35 36 + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + mov.u32 %r45, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ]; + @!%p1 mov.u32 %r42, %r6; + @!%p1 mov.u32 %r43, %r6; + @!%p1 mov.u32 %r44, %r6; + @!%p1 mov.u32 %r45, %r6; + .loc 1 37 18 + add.f32 %f21, %f5, %f1; + add.f32 %f22, %f6, %f2; + add.f32 %f23, %f7, %f3; + .loc 1 39 18 + add.f32 %f24, %f21, %f9; + add.f32 %f25, %f22, %f10; + add.f32 %f26, %f23, %f11; + .loc 1 41 18 + add.f32 %f27, %f25, %f14; + add.f32 %f28, %f26, %f15; + .loc 1 43 19 + add.f32 %f29, %f27, %f18; + add.f32 %f30, %f28, %f19; + .loc 1 41 18 + add.f32 %f31, %f24, %f13; + add.f32 %f32, %f8, %f4; + .loc 1 43 19 + add.f32 %f33, %f32, %f12; + add.f32 %f34, %f31, %f17; +$L__tmp1: + .loc 2 233 15 + add.f32 %f35, %f34, %f29; + add.f32 %f36, %f33, %f16; + add.f32 %f37, %f35, %f30; + add.f32 %f38, %f36, %f20; + mov.b32 %r71, %f38; + add.f32 %f39, %f37, %f38; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r84, %f39; + shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1; + mov.b32 %f40, %r85; +$L__tmp3: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r86, %f41; + shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1; + mov.b32 %f42, %r87; +$L__tmp5: + .loc 2 233 15 + add.f32 %f43, %f41, %f42; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r88, %f43; + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + mov.b32 %f44, %r89; +$L__tmp7: + .loc 2 233 15 + add.f32 %f45, %f43, %f44; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r90, %f45; + shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1; + mov.b32 %f46, %r91; +$L__tmp9: + .loc 2 233 15 + add.f32 %f47, %f45, %f46; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r92, %f47; + shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1; + mov.b32 %f48, %r93; +$L__tmp11: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p23, %r79, 0; + shr.u32 %r94, %r78, 3; + and.b32 %r95, %r94, 4; + mov.u32 %r96, global_smem; + add.s32 %r50, %r96, %r95; + mov.b32 %r51, %f49; + @%p23 st.shared.b32 [ %r50 + 0 ], %r51; + bar.sync 0; + setp.lt.s32 %p24, %r78, 2; + add.s32 %r53, %r96, %r80; + @%p24 ld.shared.b32 %r52, [ %r53 + 0 ]; + mov.b32 %f50, %r52; + shfl.sync.bfly.b32 %r97, %r52, 1, 31, -1; + mov.b32 %f51, %r97; +$L__tmp13: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp14: + .loc 2 243 36 + and.b32 %r98, %r78, 1; + setp.eq.b32 %p31, %r98, 1; + not.pred %p32, %p31; + and.pred %p25, %p24, %p32; + mov.b32 %r55, %f52; + @%p25 st.shared.b32 [ %r53 + 0 ], %r55; + bar.sync 0; + ld.shared.f32 %f53, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f54, %f53, 0f00000000; +$L__tmp16: + .loc 1 51 20 + mov.b32 %r57, %f54; + mov.b32 %r58, 1132462080; + div.full.f32 %r56, %r57, %r58; + mov.b32 %f55, %r56; + .loc 1 52 20 + sub.f32 %f56, %f34, %f55; + sub.f32 %f57, %f29, %f55; + sub.f32 %f58, %f30, %f55; + sub.f32 %f59, %f38, %f55; + .loc 1 53 20 + mul.f32 %f60, %f57, %f57; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f61, %f56, %f56, %f60; + fma.rn.f32 %f62, %f58, %f58, %f61; + fma.rn.f32 %f63, %f59, %f59, %f62; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r99, %f63; + shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1; + mov.b32 %f64, %r100; +$L__tmp20: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r101, %f65; + shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1; + mov.b32 %f66, %r102; +$L__tmp22: + .loc 2 233 15 + add.f32 %f67, %f65, %f66; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r103, %f67; + shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1; + mov.b32 %f68, %r104; +$L__tmp24: + .loc 2 233 15 + add.f32 %f69, %f67, %f68; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r105, %f69; + shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1; + mov.b32 %f70, %r106; +$L__tmp26: + .loc 2 233 15 + add.f32 %f71, %f69, %f70; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r107, %f71; + shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1; + mov.b32 %f72, %r108; +$L__tmp28: + .loc 2 233 15 + add.f32 %f73, %f71, %f72; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r60, %f73; + @%p23 st.shared.b32 [ %r50 + 0 ], %r60; + bar.sync 0; + @%p24 ld.shared.b32 %r61, [ %r53 + 0 ]; + mov.b32 %f74, %r61; + shfl.sync.bfly.b32 %r109, %r61, 1, 31, -1; + mov.b32 %f75, %r109; +$L__tmp30: + .loc 2 233 15 + add.f32 %f76, %f74, %f75; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r64, %f76; + @%p25 st.shared.b32 [ %r53 + 0 ], %r64; + bar.sync 0; + ld.shared.f32 %f77, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f78, %f77, 0f00000000; +$L__tmp33: + .loc 1 59 20 + mov.b32 %r66, %f78; + div.full.f32 %r65, %r66, %r58; + mov.b32 %f79, %r65; + .loc 1 61 20 + add.f32 %f80, %f79, 0f3727C5AC; + .loc 1 62 26 + rsqrt.approx.ftz.f32 %f81, %f80; + .loc 1 35 36 + mov.b32 %f82, %r45; + mov.b32 %f83, %r44; + mov.b32 %f84, %r43; + mov.b32 %f85, %r42; + .loc 1 63 20 + mul.f32 %f86, %f56, %f81; + mul.f32 %f87, %f57, %f81; + mul.f32 %f88, %f58, %f81; + mul.f32 %f89, %f59, %f81; + .loc 1 64 20 + mul.f32 %f90, %f86, %f85; + mul.f32 %f91, %f87, %f84; + mul.f32 %f92, %f88, %f83; + mul.f32 %f93, %f89, %f82; + .loc 1 66 25 + add.s64 %rd7, %rd15, %rd17; + .loc 1 66 48 + mov.b32 %r68, %f34; + mov.b32 %r69, %f29; + mov.b32 %r70, %f30; + @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 }; + .loc 1 67 25 + add.s64 %rd8, %rd16, %rd18; + .loc 1 67 48 + mov.b32 %r72, %f90; + cvt.rn.bf16.f32 %rs17, %r72; + mov.b32 %r73, %f91; + cvt.rn.bf16.f32 %rs18, %r73; + mov.b32 %r74, %f92; + cvt.rn.bf16.f32 %rs19, %r74; + mov.b32 %r75, %f93; + cvt.rn.bf16.f32 %rs20, %r75; + mov.b32 %r110, {%rs17, %rs18}; + mov.b32 %r111, {%rs19, %rs20}; + @%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 }; + .loc 1 67 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/yb/cybxnh26qvsbmxmvdr54vaav2ezk2qxu7562fhhsn4lvyvqgoglw.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 407 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 121 +.b8 98 +.b8 120 +.b8 110 +.b8 104 +.b8 50 +.b8 54 +.b8 113 +.b8 118 +.b8 115 +.b8 98 +.b8 109 +.b8 120 +.b8 109 +.b8 118 +.b8 100 +.b8 114 +.b8 53 +.b8 52 +.b8 118 +.b8 97 +.b8 97 +.b8 118 +.b8 50 +.b8 101 +.b8 122 +.b8 107 +.b8 50 +.b8 113 +.b8 120 +.b8 117 +.b8 55 +.b8 53 +.b8 54 +.b8 50 +.b8 102 +.b8 104 +.b8 104 +.b8 115 +.b8 110 +.b8 52 +.b8 108 +.b8 118 +.b8 121 +.b8 118 +.b8 113 +.b8 103 +.b8 111 +.b8 103 +.b8 108 +.b8 119 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 121 +.b8 98 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 48 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 48 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 48 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 56 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 56 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 56 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5d8f556a26051cd68c5a9cbc11f33a2d1ce6eeb5 --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir @@ -0,0 +1,76 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant 9.99999974E-6 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %21 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %25 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %28 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %29 = arith.addf %28, %16 : tensor<256xf32, #blocked> + %30 = arith.addf %29, %20 : tensor<256xf32, #blocked> + %31 = arith.addf %30, %24 : tensor<256xf32, #blocked> + %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %34 = arith.addf %33, %cst_2 : f32 + %35 = arith.divf %34, %cst_1 : f32 + %36 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked> + %37 = arith.subf %31, %36 : tensor<256xf32, #blocked> + %38 = arith.mulf %37, %37 : tensor<256xf32, #blocked> + %39 = arith.select %2, %38, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %41 = arith.addf %40, %cst_2 : f32 + %42 = arith.divf %41, %cst_1 : f32 + %43 = arith.addf %42, %cst_0 : f32 + %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %45 = tt.splat %44 : (f32) -> tensor<256xf32, #blocked> + %46 = arith.mulf %37, %45 : tensor<256xf32, #blocked> + %47 = arith.mulf %46, %27 : tensor<256xf32, #blocked> + %48 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %50 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %52 = arith.truncf %47 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a22267fb52fc938b1ffd5794663968a09ee5a8e6 --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir @@ -0,0 +1,57 @@ +module { + tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %16 = arith.addf %8, %12 : tensor<256xf32> + %17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32> + %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32>) -> f32 + %19 = arith.addf %18, %cst_0 : f32 + %20 = arith.divf %19, %cst_1 : f32 + %21 = tt.splat %20 : (f32) -> tensor<256xf32> + %22 = arith.subf %16, %21 : tensor<256xf32> + %23 = arith.mulf %22, %22 : tensor<256xf32> + %24 = arith.select %2, %23, %cst_3 : tensor<256xi1>, tensor<256xf32> + %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32>) -> f32 + %26 = arith.addf %25, %cst_0 : f32 + %27 = arith.divf %26, %cst_1 : f32 + %28 = arith.addf %27, %cst_2 : f32 + %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %30 = tt.splat %29 : (f32) -> tensor<256xf32> + %31 = arith.mulf %22, %30 : tensor<256xf32> + %32 = arith.mulf %31, %15 : tensor<256xf32> + %33 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %35 = arith.truncf %32 : tensor<256xf32> to tensor<256xbf16> + tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9678d604de6931a3aabff53458ec936a78b40315 Binary files /dev/null and b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin differ diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..54590517dffaf74b88829ef8ed110936524364f1 --- /dev/null +++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx @@ -0,0 +1,764 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1de +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1de( + .param .u64 triton__0d1de_param_0, + .param .u32 triton__0d1de_param_1 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<27>; + .reg .b16 %rs<17>; + .reg .b32 %r<67>; + .reg .f32 %f<431>; + .reg .b64 %rd<6>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r14, %tid.x; + shl.b32 %r15, %r14, 3; + and.b32 %r16, %r15, 1016; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r17, %r1, 10; + .loc 1 21 23 + or.b32 %r18, %r17, %r16; + .loc 1 24 34 + mul.wide.s32 %rd4, %r18, 2; + add.s64 %rd5, %rd3, %rd4; + mov.pred %p1, -1; + .loc 1 24 39 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd5 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + .loc 1 24 48 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + .loc 1 29 18 + mul.f32 %f9, %f1, 0f3F3504F3; + .loc 1 30 23 + abs.ftz.f32 %f17, %f9; + setp.ge.f32 %p2, %f17, 0f3F8060FE; + mov.f32 %f365, 0f3789CA3C; + mov.f32 %f364, 0fB9F560B9; + mov.f32 %f363, 0f3BAC840B; + mov.f32 %f362, 0fBD0C8162; + mov.f32 %f361, 0f3E1CF906; + mov.f32 %f360, 0f3F6A937E; + mov.f32 %f359, 0f3F20D842; + mov.f32 %f366, %f17; + @%p2 bra $L__BB0_2; + .loc 1 0 23 + mov.f32 %f365, 0f38B1E96A; + mov.f32 %f364, 0fBA574D20; + mov.f32 %f363, 0f3BAAD5EA; + mov.f32 %f362, 0fBCDC1BE7; + mov.f32 %f361, 0f3DE718AF; + mov.f32 %f360, 0fBEC093AC; + mov.f32 %f359, 0f3E0375D3; + .loc 1 30 23 + mul.f32 %f366, %f9, %f9; +$L__BB0_2: + .loc 1 0 0 + cvt.f32.bf16 %r8, %rs3; + mul.f32 %f10, %f2, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p3, %f17, 0f3F8060FE; + fma.rn.ftz.f32 %f135, %f365, %f366, %f364; + fma.rn.ftz.f32 %f136, %f135, %f366, %f363; + fma.rn.ftz.f32 %f137, %f136, %f366, %f362; + fma.rn.ftz.f32 %f138, %f137, %f366, %f361; + fma.rn.ftz.f32 %f139, %f138, %f366, %f360; + fma.rn.ftz.f32 %f140, %f139, %f366, %f359; + neg.f32 %f141, %f366; + selp.f32 %f142, %f141, %f9, %p2; + fma.rn.ftz.f32 %f367, %f140, %f142, %f142; + mov.f32 %f358, 0f3F800000; + @%p3 bra $L__BB0_4; + ex2.approx.ftz.f32 %f143, %f367; + sub.f32 %f145, %f358, %f143; + mov.b32 %r19, %f145; + mov.b32 %r20, %f9; + and.b32 %r21, %r20, -2147483648; + or.b32 %r22, %r21, %r19; + mov.b32 %f367, %r22; +$L__BB0_4: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + mov.b32 %f3, %r8; + .loc 1 30 23 + abs.ftz.f32 %f30, %f10; + setp.ge.f32 %p5, %f30, 0f3F8060FE; + mov.f32 %f374, 0f3789CA3C; + mov.f32 %f373, 0fB9F560B9; + mov.f32 %f372, 0f3BAC840B; + mov.f32 %f371, 0fBD0C8162; + mov.f32 %f370, 0f3E1CF906; + mov.f32 %f369, 0f3F6A937E; + mov.f32 %f368, 0f3F20D842; + mov.f32 %f375, %f30; + @%p5 bra $L__BB0_6; + mul.f32 %f375, %f10, %f10; + mov.f32 %f374, 0f38B1E96A; + mov.f32 %f373, 0fBA574D20; + mov.f32 %f372, 0f3BAAD5EA; + mov.f32 %f371, 0fBCDC1BE7; + mov.f32 %f370, 0f3DE718AF; + mov.f32 %f369, 0fBEC093AC; + mov.f32 %f368, 0f3E0375D3; +$L__BB0_6: + .loc 1 0 0 + cvt.f32.bf16 %r9, %rs4; + mul.f32 %f11, %f3, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p6, %f30, 0f3F8060FE; + fma.rn.ftz.f32 %f160, %f374, %f375, %f373; + fma.rn.ftz.f32 %f161, %f160, %f375, %f372; + fma.rn.ftz.f32 %f162, %f161, %f375, %f371; + fma.rn.ftz.f32 %f163, %f162, %f375, %f370; + fma.rn.ftz.f32 %f164, %f163, %f375, %f369; + fma.rn.ftz.f32 %f165, %f164, %f375, %f368; + neg.f32 %f166, %f375; + selp.f32 %f167, %f166, %f10, %p5; + fma.rn.ftz.f32 %f376, %f165, %f167, %f167; + @%p6 bra $L__BB0_8; + ex2.approx.ftz.f32 %f168, %f376; + sub.f32 %f170, %f358, %f168; + mov.b32 %r23, %f170; + mov.b32 %r24, %f10; + and.b32 %r25, %r24, -2147483648; + or.b32 %r26, %r25, %r23; + mov.b32 %f376, %r26; +$L__BB0_8: + .loc 1 0 0 + cvt.u16.u32 %rs5, %r4; + mov.b32 %f4, %r9; + .loc 1 30 23 + abs.ftz.f32 %f43, %f11; + setp.ge.f32 %p8, %f43, 0f3F8060FE; + mov.f32 %f383, 0f3789CA3C; + mov.f32 %f382, 0fB9F560B9; + mov.f32 %f381, 0f3BAC840B; + mov.f32 %f380, 0fBD0C8162; + mov.f32 %f379, 0f3E1CF906; + mov.f32 %f378, 0f3F6A937E; + mov.f32 %f377, 0f3F20D842; + mov.f32 %f384, %f43; + @%p8 bra $L__BB0_10; + mul.f32 %f384, %f11, %f11; + mov.f32 %f383, 0f38B1E96A; + mov.f32 %f382, 0fBA574D20; + mov.f32 %f381, 0f3BAAD5EA; + mov.f32 %f380, 0fBCDC1BE7; + mov.f32 %f379, 0f3DE718AF; + mov.f32 %f378, 0fBEC093AC; + mov.f32 %f377, 0f3E0375D3; +$L__BB0_10: + .loc 1 0 0 + cvt.f32.bf16 %r10, %rs5; + mul.f32 %f12, %f4, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p9, %f43, 0f3F8060FE; + fma.rn.ftz.f32 %f185, %f383, %f384, %f382; + fma.rn.ftz.f32 %f186, %f185, %f384, %f381; + fma.rn.ftz.f32 %f187, %f186, %f384, %f380; + fma.rn.ftz.f32 %f188, %f187, %f384, %f379; + fma.rn.ftz.f32 %f189, %f188, %f384, %f378; + fma.rn.ftz.f32 %f190, %f189, %f384, %f377; + neg.f32 %f191, %f384; + selp.f32 %f192, %f191, %f11, %p8; + fma.rn.ftz.f32 %f385, %f190, %f192, %f192; + @%p9 bra $L__BB0_12; + ex2.approx.ftz.f32 %f193, %f385; + sub.f32 %f195, %f358, %f193; + mov.b32 %r27, %f195; + mov.b32 %r28, %f11; + and.b32 %r29, %r28, -2147483648; + or.b32 %r30, %r29, %r27; + mov.b32 %f385, %r30; +$L__BB0_12: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; } + mov.b32 %f5, %r10; + .loc 1 30 23 + abs.ftz.f32 %f56, %f12; + setp.ge.f32 %p11, %f56, 0f3F8060FE; + mov.f32 %f392, 0f3789CA3C; + mov.f32 %f391, 0fB9F560B9; + mov.f32 %f390, 0f3BAC840B; + mov.f32 %f389, 0fBD0C8162; + mov.f32 %f388, 0f3E1CF906; + mov.f32 %f387, 0f3F6A937E; + mov.f32 %f386, 0f3F20D842; + mov.f32 %f393, %f56; + @%p11 bra $L__BB0_14; + mul.f32 %f393, %f12, %f12; + mov.f32 %f392, 0f38B1E96A; + mov.f32 %f391, 0fBA574D20; + mov.f32 %f390, 0f3BAAD5EA; + mov.f32 %f389, 0fBCDC1BE7; + mov.f32 %f388, 0f3DE718AF; + mov.f32 %f387, 0fBEC093AC; + mov.f32 %f386, 0f3E0375D3; +$L__BB0_14: + .loc 1 0 0 + cvt.f32.bf16 %r11, %rs6; + mul.f32 %f13, %f5, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p12, %f56, 0f3F8060FE; + fma.rn.ftz.f32 %f210, %f392, %f393, %f391; + fma.rn.ftz.f32 %f211, %f210, %f393, %f390; + fma.rn.ftz.f32 %f212, %f211, %f393, %f389; + fma.rn.ftz.f32 %f213, %f212, %f393, %f388; + fma.rn.ftz.f32 %f214, %f213, %f393, %f387; + fma.rn.ftz.f32 %f215, %f214, %f393, %f386; + neg.f32 %f216, %f393; + selp.f32 %f217, %f216, %f12, %p11; + fma.rn.ftz.f32 %f394, %f215, %f217, %f217; + @%p12 bra $L__BB0_16; + ex2.approx.ftz.f32 %f218, %f394; + sub.f32 %f220, %f358, %f218; + mov.b32 %r31, %f220; + mov.b32 %r32, %f12; + and.b32 %r33, %r32, -2147483648; + or.b32 %r34, %r33, %r31; + mov.b32 %f394, %r34; +$L__BB0_16: + .loc 1 0 0 + cvt.u16.u32 %rs7, %r5; + mov.b32 %f6, %r11; + .loc 1 30 23 + abs.ftz.f32 %f69, %f13; + setp.ge.f32 %p14, %f69, 0f3F8060FE; + mov.f32 %f401, 0f3789CA3C; + mov.f32 %f400, 0fB9F560B9; + mov.f32 %f399, 0f3BAC840B; + mov.f32 %f398, 0fBD0C8162; + mov.f32 %f397, 0f3E1CF906; + mov.f32 %f396, 0f3F6A937E; + mov.f32 %f395, 0f3F20D842; + mov.f32 %f402, %f69; + @%p14 bra $L__BB0_18; + mul.f32 %f402, %f13, %f13; + mov.f32 %f401, 0f38B1E96A; + mov.f32 %f400, 0fBA574D20; + mov.f32 %f399, 0f3BAAD5EA; + mov.f32 %f398, 0fBCDC1BE7; + mov.f32 %f397, 0f3DE718AF; + mov.f32 %f396, 0fBEC093AC; + mov.f32 %f395, 0f3E0375D3; +$L__BB0_18: + .loc 1 0 0 + cvt.f32.bf16 %r12, %rs7; + mul.f32 %f14, %f6, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p15, %f69, 0f3F8060FE; + fma.rn.ftz.f32 %f235, %f401, %f402, %f400; + fma.rn.ftz.f32 %f236, %f235, %f402, %f399; + fma.rn.ftz.f32 %f237, %f236, %f402, %f398; + fma.rn.ftz.f32 %f238, %f237, %f402, %f397; + fma.rn.ftz.f32 %f239, %f238, %f402, %f396; + fma.rn.ftz.f32 %f240, %f239, %f402, %f395; + neg.f32 %f241, %f402; + selp.f32 %f242, %f241, %f13, %p14; + fma.rn.ftz.f32 %f403, %f240, %f242, %f242; + @%p15 bra $L__BB0_20; + ex2.approx.ftz.f32 %f243, %f403; + sub.f32 %f245, %f358, %f243; + mov.b32 %r35, %f245; + mov.b32 %r36, %f13; + and.b32 %r37, %r36, -2147483648; + or.b32 %r38, %r37, %r35; + mov.b32 %f403, %r38; +$L__BB0_20: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } + mov.b32 %f7, %r12; + .loc 1 30 23 + abs.ftz.f32 %f82, %f14; + setp.ge.f32 %p17, %f82, 0f3F8060FE; + mov.f32 %f410, 0f3789CA3C; + mov.f32 %f409, 0fB9F560B9; + mov.f32 %f408, 0f3BAC840B; + mov.f32 %f407, 0fBD0C8162; + mov.f32 %f406, 0f3E1CF906; + mov.f32 %f405, 0f3F6A937E; + mov.f32 %f404, 0f3F20D842; + mov.f32 %f411, %f82; + @%p17 bra $L__BB0_22; + mul.f32 %f411, %f14, %f14; + mov.f32 %f410, 0f38B1E96A; + mov.f32 %f409, 0fBA574D20; + mov.f32 %f408, 0f3BAAD5EA; + mov.f32 %f407, 0fBCDC1BE7; + mov.f32 %f406, 0f3DE718AF; + mov.f32 %f405, 0fBEC093AC; + mov.f32 %f404, 0f3E0375D3; +$L__BB0_22: + .loc 1 0 0 + cvt.f32.bf16 %r13, %rs8; + mul.f32 %f15, %f7, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p18, %f82, 0f3F8060FE; + fma.rn.ftz.f32 %f260, %f410, %f411, %f409; + fma.rn.ftz.f32 %f261, %f260, %f411, %f408; + fma.rn.ftz.f32 %f262, %f261, %f411, %f407; + fma.rn.ftz.f32 %f263, %f262, %f411, %f406; + fma.rn.ftz.f32 %f264, %f263, %f411, %f405; + fma.rn.ftz.f32 %f265, %f264, %f411, %f404; + neg.f32 %f266, %f411; + selp.f32 %f267, %f266, %f14, %p17; + fma.rn.ftz.f32 %f412, %f265, %f267, %f267; + @%p18 bra $L__BB0_24; + ex2.approx.ftz.f32 %f268, %f412; + sub.f32 %f270, %f358, %f268; + mov.b32 %r39, %f270; + mov.b32 %r40, %f14; + and.b32 %r41, %r40, -2147483648; + or.b32 %r42, %r41, %r39; + mov.b32 %f412, %r42; +$L__BB0_24: + .loc 1 0 0 + mov.b32 %f8, %r13; + .loc 1 30 23 + abs.ftz.f32 %f95, %f15; + setp.ge.f32 %p20, %f95, 0f3F8060FE; + mov.f32 %f419, 0f3789CA3C; + mov.f32 %f418, 0fB9F560B9; + mov.f32 %f417, 0f3BAC840B; + mov.f32 %f416, 0fBD0C8162; + mov.f32 %f415, 0f3E1CF906; + mov.f32 %f414, 0f3F6A937E; + mov.f32 %f413, 0f3F20D842; + mov.f32 %f420, %f95; + @%p20 bra $L__BB0_26; + mul.f32 %f420, %f15, %f15; + mov.f32 %f419, 0f38B1E96A; + mov.f32 %f418, 0fBA574D20; + mov.f32 %f417, 0f3BAAD5EA; + mov.f32 %f416, 0fBCDC1BE7; + mov.f32 %f415, 0f3DE718AF; + mov.f32 %f414, 0fBEC093AC; + mov.f32 %f413, 0f3E0375D3; +$L__BB0_26: + .loc 1 0 0 + mul.f32 %f16, %f8, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p21, %f95, 0f3F8060FE; + fma.rn.ftz.f32 %f285, %f419, %f420, %f418; + fma.rn.ftz.f32 %f286, %f285, %f420, %f417; + fma.rn.ftz.f32 %f287, %f286, %f420, %f416; + fma.rn.ftz.f32 %f288, %f287, %f420, %f415; + fma.rn.ftz.f32 %f289, %f288, %f420, %f414; + fma.rn.ftz.f32 %f290, %f289, %f420, %f413; + neg.f32 %f291, %f420; + selp.f32 %f292, %f291, %f15, %p20; + fma.rn.ftz.f32 %f421, %f290, %f292, %f292; + @%p21 bra $L__BB0_28; + ex2.approx.ftz.f32 %f293, %f421; + sub.f32 %f295, %f358, %f293; + mov.b32 %r43, %f295; + mov.b32 %r44, %f15; + and.b32 %r45, %r44, -2147483648; + or.b32 %r46, %r45, %r43; + mov.b32 %f421, %r46; +$L__BB0_28: + abs.ftz.f32 %f108, %f16; + setp.ge.f32 %p23, %f108, 0f3F8060FE; + mov.f32 %f428, 0f3789CA3C; + mov.f32 %f427, 0fB9F560B9; + mov.f32 %f426, 0f3BAC840B; + mov.f32 %f425, 0fBD0C8162; + mov.f32 %f424, 0f3E1CF906; + mov.f32 %f423, 0f3F6A937E; + mov.f32 %f422, 0f3F20D842; + mov.f32 %f429, %f108; + @%p23 bra $L__BB0_30; + mul.f32 %f429, %f16, %f16; + mov.f32 %f428, 0f38B1E96A; + mov.f32 %f427, 0fBA574D20; + mov.f32 %f426, 0f3BAAD5EA; + mov.f32 %f425, 0fBCDC1BE7; + mov.f32 %f424, 0f3DE718AF; + mov.f32 %f423, 0fBEC093AC; + mov.f32 %f422, 0f3E0375D3; +$L__BB0_30: + setp.ltu.f32 %p24, %f108, 0f3F8060FE; + fma.rn.ftz.f32 %f310, %f428, %f429, %f427; + fma.rn.ftz.f32 %f311, %f310, %f429, %f426; + fma.rn.ftz.f32 %f312, %f311, %f429, %f425; + fma.rn.ftz.f32 %f313, %f312, %f429, %f424; + fma.rn.ftz.f32 %f314, %f313, %f429, %f423; + fma.rn.ftz.f32 %f315, %f314, %f429, %f422; + neg.f32 %f316, %f429; + selp.f32 %f317, %f316, %f16, %p23; + fma.rn.ftz.f32 %f430, %f315, %f317, %f317; + @%p24 bra $L__BB0_32; + ex2.approx.ftz.f32 %f318, %f430; + sub.f32 %f320, %f358, %f318; + mov.b32 %r47, %f320; + mov.b32 %r48, %f16; + and.b32 %r49, %r48, -2147483648; + or.b32 %r50, %r49, %r47; + mov.b32 %f430, %r50; +$L__BB0_32: + .loc 1 27 18 + mul.f32 %f321, %f8, 0f3F000000; + mul.f32 %f322, %f7, 0f3F000000; + mul.f32 %f323, %f6, 0f3F000000; + mul.f32 %f324, %f5, 0f3F000000; + mul.f32 %f325, %f4, 0f3F000000; + mul.f32 %f326, %f3, 0f3F000000; + mul.f32 %f327, %f2, 0f3F000000; + mul.f32 %f328, %f1, 0f3F000000; + .loc 1 32 18 + add.f32 %f329, %f367, 0f3F800000; + add.f32 %f330, %f376, 0f3F800000; + add.f32 %f331, %f385, 0f3F800000; + add.f32 %f332, %f394, 0f3F800000; + add.f32 %f333, %f403, 0f3F800000; + add.f32 %f334, %f412, 0f3F800000; + add.f32 %f335, %f421, 0f3F800000; + add.f32 %f336, %f430, 0f3F800000; + .loc 1 33 18 + mul.f32 %f337, %f328, %f329; + mul.f32 %f338, %f327, %f330; + mul.f32 %f339, %f326, %f331; + mul.f32 %f340, %f325, %f332; + mul.f32 %f341, %f324, %f333; + mul.f32 %f342, %f323, %f334; + mul.f32 %f343, %f322, %f335; + mul.f32 %f344, %f321, %f336; + .loc 1 35 40 + mov.b32 %r51, %f337; + cvt.rn.bf16.f32 %rs9, %r51; + mov.b32 %r52, %f338; + cvt.rn.bf16.f32 %rs10, %r52; + mov.b32 %r53, %f339; + cvt.rn.bf16.f32 %rs11, %r53; + mov.b32 %r54, %f340; + cvt.rn.bf16.f32 %rs12, %r54; + mov.b32 %r55, %f341; + cvt.rn.bf16.f32 %rs13, %r55; + mov.b32 %r56, %f342; + cvt.rn.bf16.f32 %rs14, %r56; + mov.b32 %r57, %f343; + cvt.rn.bf16.f32 %rs15, %r57; + mov.b32 %r58, %f344; + cvt.rn.bf16.f32 %rs16, %r58; + mov.b32 %r63, {%rs9, %rs10}; + mov.b32 %r64, {%rs11, %rs12}; + mov.b32 %r65, {%rs13, %rs14}; + mov.b32 %r66, {%rs15, %rs16}; + @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r63, %r64, %r65, %r66 }; + .loc 1 35 4 + ret; +$L__tmp1: +$L__func_end0: + +} + // .globl __nv_erff +.visible .func (.param .b32 func_retval0) __nv_erff( + .param .b32 __nv_erff_param_0 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<5>; + .reg .f32 %f<49>; +$L__func_begin1: + + ld.param.f32 %f14, [__nv_erff_param_0]; + abs.ftz.f32 %f1, %f14; + setp.ge.f32 %p1, %f1, 0f3F8060FE; + mov.f32 %f46, 0f3789CA3C; + mov.f32 %f45, 0fB9F560B9; + mov.f32 %f44, 0f3BAC840B; + mov.f32 %f43, 0fBD0C8162; + mov.f32 %f42, 0f3E1CF906; + mov.f32 %f41, 0f3F6A937E; + mov.f32 %f40, 0f3F20D842; + mov.f32 %f47, %f1; + @%p1 bra $L__BB1_2; + mul.f32 %f47, %f14, %f14; + mov.f32 %f46, 0f38B1E96A; + mov.f32 %f45, 0fBA574D20; + mov.f32 %f44, 0f3BAAD5EA; + mov.f32 %f43, 0fBCDC1BE7; + mov.f32 %f42, 0f3DE718AF; + mov.f32 %f41, 0fBEC093AC; + mov.f32 %f40, 0f3E0375D3; +$L__BB1_2: + setp.ltu.f32 %p2, %f1, 0f3F8060FE; + fma.rn.ftz.f32 %f29, %f46, %f47, %f45; + fma.rn.ftz.f32 %f30, %f29, %f47, %f44; + fma.rn.ftz.f32 %f31, %f30, %f47, %f43; + fma.rn.ftz.f32 %f32, %f31, %f47, %f42; + fma.rn.ftz.f32 %f33, %f32, %f47, %f41; + fma.rn.ftz.f32 %f34, %f33, %f47, %f40; + neg.f32 %f35, %f47; + selp.f32 %f36, %f35, %f14, %p1; + fma.rn.ftz.f32 %f48, %f34, %f36, %f36; + @%p2 bra $L__BB1_4; + ex2.approx.ftz.f32 %f37, %f48; + mov.f32 %f38, 0f3F800000; + sub.f32 %f39, %f38, %f37; + mov.b32 %r1, %f39; + mov.b32 %r2, %f14; + and.b32 %r3, %r2, -2147483648; + or.b32 %r4, %r3, %r1; + mov.b32 %f48, %r4; +$L__BB1_4: + st.param.f32 [func_retval0+0], %f48; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/af/cafucwnmq4o436kwzkmrinerrnocxll7q6wsadcl726g6cradipo.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 172 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 97 +.b8 102 +.b8 117 +.b8 99 +.b8 119 +.b8 110 +.b8 109 +.b8 113 +.b8 52 +.b8 111 +.b8 52 +.b8 51 +.b8 54 +.b8 107 +.b8 119 +.b8 122 +.b8 107 +.b8 109 +.b8 114 +.b8 105 +.b8 110 +.b8 101 +.b8 114 +.b8 114 +.b8 110 +.b8 111 +.b8 99 +.b8 120 +.b8 108 +.b8 108 +.b8 55 +.b8 113 +.b8 54 +.b8 119 +.b8 115 +.b8 97 +.b8 100 +.b8 99 +.b8 108 +.b8 55 +.b8 50 +.b8 54 +.b8 103 +.b8 54 +.b8 99 +.b8 114 +.b8 97 +.b8 100 +.b8 105 +.b8 112 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 97 +.b8 102 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..aa5ee41897a9e19b28af66673e146870a146c655 --- /dev/null +++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir @@ -0,0 +1,26 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> + %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> + %9 = arith.mulf %8, %cst_1 : tensor<1024xf32, #blocked> + %10 = arith.mulf %8, %cst_0 : tensor<1024xf32, #blocked> + %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked> + %12 = arith.addf %11, %cst : tensor<1024xf32, #blocked> + %13 = arith.mulf %9, %12 : tensor<1024xf32, #blocked> + %14 = arith.truncf %13 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> + tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..dad0184e8e826d75c09bb514f3cc2b556ca794fe --- /dev/null +++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir @@ -0,0 +1,25 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32> + %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = arith.mulf %8, %cst_1 : tensor<1024xf32> + %10 = arith.mulf %8, %cst_0 : tensor<1024xf32> + %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32> + %12 = arith.addf %11, %cst : tensor<1024xf32> + %13 = arith.mulf %9, %12 : tensor<1024xf32> + %14 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16> + tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16> + tt.return + } +} diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9a60127ef9a1339be5a685a376ee20e664d01a42 Binary files /dev/null and b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin differ diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..5d9bab6a92c0f5bd59a83f20f4665258f2e57ea1 --- /dev/null +++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir @@ -0,0 +1,290 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = and i32 %6, 7, !dbg !8 + %10 = shl nuw nsw i32 %9, 2, !dbg !8 + %11 = and i32 %8, 7, !dbg !9 + %12 = lshr i32 %7, 3, !dbg !9 + %13 = shl nuw nsw i32 %11, 2, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = or i32 %14, 96, !dbg !9 + %16 = or i32 %10, 1, !dbg !10 + %17 = or i32 %10, 2, !dbg !10 + %18 = or i32 %10, 3, !dbg !10 + %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14 + %20 = shl i32 %19, 5, !dbg !15 + %21 = or i32 %20, %10, !dbg !16 + %22 = or i32 %20, %7, !dbg !16 + %23 = icmp ult i32 %15, 120, !dbg !17 + %24 = shl nuw nsw i32 %14, 17, !dbg !18 + %25 = or i32 %24, 4194304, !dbg !18 + %26 = or i32 %24, 8388608, !dbg !18 + %27 = shl nuw nsw i32 %15, 17, !dbg !18 + %28 = add i32 %21, %24, !dbg !19 + %29 = add i32 %25, %21, !dbg !19 + %30 = add i32 %26, %21, !dbg !19 + %31 = add i32 %21, %27, !dbg !19 + %32 = sext i32 %28 to i64, !dbg !20 + %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !20 + %34 = sext i32 %29 to i64, !dbg !20 + %35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !20 + %36 = sext i32 %30 to i64, !dbg !20 + %37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !20 + %38 = sext i32 %31 to i64, !dbg !20 + %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !20 + %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !21 + %42 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !21 + %43 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !21 + %44 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !21 + %45 = bitcast i32 %41 to float, !dbg !21 + %46 = bitcast i32 %42 to float, !dbg !21 + %47 = bitcast i32 %43 to float, !dbg !21 + %48 = bitcast i32 %44 to float, !dbg !21 + %49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !21 + %51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !21 + %52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !21 + %53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !21 + %54 = bitcast i32 %50 to float, !dbg !21 + %55 = bitcast i32 %51 to float, !dbg !21 + %56 = bitcast i32 %52 to float, !dbg !21 + %57 = bitcast i32 %53 to float, !dbg !21 + %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !21 + %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !21 + %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !21 + %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !21 + %63 = bitcast i32 %59 to float, !dbg !21 + %64 = bitcast i32 %60 to float, !dbg !21 + %65 = bitcast i32 %61 to float, !dbg !21 + %66 = bitcast i32 %62 to float, !dbg !21 + %67 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23) #3, !dbg !21 + %68 = extractvalue { i32, i32, i32, i32 } %67, 0, !dbg !21 + %69 = extractvalue { i32, i32, i32, i32 } %67, 1, !dbg !21 + %70 = extractvalue { i32, i32, i32, i32 } %67, 2, !dbg !21 + %71 = extractvalue { i32, i32, i32, i32 } %67, 3, !dbg !21 + %72 = bitcast i32 %68 to float, !dbg !21 + %73 = bitcast i32 %69 to float, !dbg !21 + %74 = bitcast i32 %70 to float, !dbg !21 + %75 = bitcast i32 %71 to float, !dbg !21 + %76 = fadd float %45, 0.000000e+00, !dbg !22 + %77 = fadd float %46, 0.000000e+00, !dbg !22 + %78 = fadd float %47, 0.000000e+00, !dbg !22 + %79 = fadd float %48, 0.000000e+00, !dbg !22 + %80 = fadd float %54, 0.000000e+00, !dbg !22 + %81 = fadd float %55, 0.000000e+00, !dbg !22 + %82 = fadd float %56, 0.000000e+00, !dbg !22 + %83 = fadd float %57, 0.000000e+00, !dbg !22 + %84 = fadd float %63, 0.000000e+00, !dbg !22 + %85 = fadd float %64, 0.000000e+00, !dbg !22 + %86 = fadd float %65, 0.000000e+00, !dbg !22 + %87 = fadd float %66, 0.000000e+00, !dbg !22 + %88 = fadd float %72, 0.000000e+00, !dbg !22 + %89 = fadd float %73, 0.000000e+00, !dbg !22 + %90 = fadd float %74, 0.000000e+00, !dbg !22 + %91 = fadd float %75, 0.000000e+00, !dbg !22 + %92 = select i1 %23, float %88, float 0.000000e+00, !dbg !23 + %93 = select i1 %23, float %89, float 0.000000e+00, !dbg !23 + %94 = select i1 %23, float %90, float 0.000000e+00, !dbg !23 + %95 = select i1 %23, float %91, float 0.000000e+00, !dbg !23 + %96 = fadd float %76, %80, !dbg !24 + %97 = fadd float %77, %81, !dbg !24 + %98 = fadd float %78, %82, !dbg !24 + %99 = fadd float %79, %83, !dbg !24 + %100 = fadd float %96, %84, !dbg !24 + %101 = fadd float %97, %85, !dbg !24 + %102 = fadd float %98, %86, !dbg !24 + %103 = fadd float %99, %87, !dbg !24 + %104 = fadd float %100, %92, !dbg !24 + %105 = fadd float %101, %93, !dbg !24 + %106 = fadd float %102, %94, !dbg !24 + %107 = fadd float %103, %95, !dbg !24 + %108 = bitcast float %104 to i32, !dbg !10 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !10 + %110 = bitcast i32 %109 to float, !dbg !10 + %111 = fadd float %104, %110, !dbg !24 + %112 = bitcast float %111 to i32, !dbg !10 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !10 + %114 = bitcast i32 %113 to float, !dbg !10 + %115 = fadd float %111, %114, !dbg !24 + %116 = bitcast float %105 to i32, !dbg !10 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !10 + %118 = bitcast i32 %117 to float, !dbg !10 + %119 = fadd float %105, %118, !dbg !24 + %120 = bitcast float %119 to i32, !dbg !10 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 8, i32 31), !dbg !10 + %122 = bitcast i32 %121 to float, !dbg !10 + %123 = fadd float %119, %122, !dbg !24 + %124 = bitcast float %106 to i32, !dbg !10 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 16, i32 31), !dbg !10 + %126 = bitcast i32 %125 to float, !dbg !10 + %127 = fadd float %106, %126, !dbg !24 + %128 = bitcast float %127 to i32, !dbg !10 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !10 + %130 = bitcast i32 %129 to float, !dbg !10 + %131 = fadd float %127, %130, !dbg !24 + %132 = bitcast float %107 to i32, !dbg !10 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !10 + %134 = bitcast i32 %133 to float, !dbg !10 + %135 = fadd float %107, %134, !dbg !24 + %136 = bitcast float %135 to i32, !dbg !10 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !10 + %138 = bitcast i32 %137 to float, !dbg !10 + %139 = fadd float %135, %138, !dbg !24 + %140 = icmp ult i32 %7, 8, !dbg !10 + %141 = shl nuw nsw i32 %9, 5, !dbg !10 + %142 = or i32 %141, %11, !dbg !10 + %143 = zext nneg i32 %142 to i64, !dbg !10 + %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %144, float %115, i1 %140) #3, !dbg !10 + %145 = shl nuw nsw i32 %16, 3, !dbg !10 + %146 = or i32 %145, %11, !dbg !10 + %147 = zext nneg i32 %146 to i64, !dbg !10 + %148 = getelementptr float, ptr addrspace(3) @global_smem, i64 %147, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, float %123, i1 %140) #3, !dbg !10 + %149 = shl nuw nsw i32 %17, 3, !dbg !10 + %150 = or i32 %149, %11, !dbg !10 + %151 = zext nneg i32 %150 to i64, !dbg !10 + %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %152, float %131, i1 %140) #3, !dbg !10 + %153 = shl nuw nsw i32 %18, 3, !dbg !10 + %154 = or i32 %153, %11, !dbg !10 + %155 = zext nneg i32 %154 to i64, !dbg !10 + %156 = getelementptr float, ptr addrspace(3) @global_smem, i64 %155, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %156, float %139, i1 %140) #3, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !10 + %157 = icmp slt i32 %6, 256, !dbg !10 + %158 = sext i32 %6 to i64, !dbg !10 + %159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !10 + %160 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %159, i1 %157) #3, !dbg !10 + %161 = bitcast float %160 to i32, !dbg !10 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !10 + %163 = bitcast i32 %162 to float, !dbg !10 + %164 = fadd float %160, %163, !dbg !24 + %165 = bitcast float %164 to i32, !dbg !10 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !10 + %167 = bitcast i32 %166 to float, !dbg !10 + %168 = fadd float %164, %167, !dbg !24 + %169 = bitcast float %168 to i32, !dbg !10 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !10 + %171 = bitcast i32 %170 to float, !dbg !10 + %172 = fadd float %168, %171, !dbg !24 + %173 = icmp eq i32 %9, 0, !dbg !10 + %174 = and i1 %157, %173, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %172, i1 %174) #3, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !10 + %175 = zext nneg i32 %141 to i64, !dbg !10 + %176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !10 + %177 = load float, ptr addrspace(3) %176, align 4, !dbg !10 + %178 = zext nneg i32 %145 to i64, !dbg !10 + %179 = getelementptr float, ptr addrspace(3) @global_smem, i64 %178, !dbg !10 + %180 = load float, ptr addrspace(3) %179, align 4, !dbg !10 + %181 = zext nneg i32 %149 to i64, !dbg !10 + %182 = getelementptr float, ptr addrspace(3) @global_smem, i64 %181, !dbg !10 + %183 = load float, ptr addrspace(3) %182, align 4, !dbg !10 + %184 = zext nneg i32 %153 to i64, !dbg !10 + %185 = getelementptr float, ptr addrspace(3) @global_smem, i64 %184, !dbg !10 + %186 = load float, ptr addrspace(3) %185, align 4, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %187 = zext nneg i32 %10 to i64, !dbg !28 + %188 = getelementptr float, ptr addrspace(3) @global_smem, i64 %187, !dbg !28 + %189 = insertelement <1 x float> undef, float %177, i64 0, !dbg !28 + store <1 x float> %189, ptr addrspace(3) %188, align 4, !dbg !28 + %190 = zext nneg i32 %16 to i64, !dbg !28 + %191 = getelementptr float, ptr addrspace(3) @global_smem, i64 %190, !dbg !28 + %192 = insertelement <1 x float> undef, float %180, i64 0, !dbg !28 + store <1 x float> %192, ptr addrspace(3) %191, align 4, !dbg !28 + %193 = zext nneg i32 %17 to i64, !dbg !28 + %194 = getelementptr float, ptr addrspace(3) @global_smem, i64 %193, !dbg !28 + %195 = insertelement <1 x float> undef, float %183, i64 0, !dbg !28 + store <1 x float> %195, ptr addrspace(3) %194, align 4, !dbg !28 + %196 = zext nneg i32 %18 to i64, !dbg !28 + %197 = getelementptr float, ptr addrspace(3) @global_smem, i64 %196, !dbg !28 + %198 = insertelement <1 x float> undef, float %186, i64 0, !dbg !28 + store <1 x float> %198, ptr addrspace(3) %197, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %199 = zext nneg i32 %7 to i64, !dbg !28 + %200 = getelementptr float, ptr addrspace(3) @global_smem, i64 %199, !dbg !28 + %201 = load <1 x float>, ptr addrspace(3) %200, align 4, !dbg !28 + %.frozen = freeze i32 %22 + %202 = sdiv i32 %.frozen, 256, !dbg !29 + %203 = mul i32 %202, 256 + %.decomposed = sub i32 %.frozen, %203 + %204 = sext i32 %202 to i64, !dbg !30 + %205 = getelementptr i64, ptr addrspace(1) %1, i64 %204, !dbg !30 + %206 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %205, i1 true) #3, !dbg !31 + %207 = lshr i64 %206, 54, !dbg !32 + %208 = and i64 %207, 512, !dbg !32 + %209 = add i64 %208, %206, !dbg !32 + %210 = shl i64 %209, 8, !dbg !33 + %211 = sext i32 %.decomposed to i64, !dbg !34 + %212 = getelementptr float, ptr addrspace(1) %2, i64 %210, !dbg !35 + %213 = getelementptr float, ptr addrspace(1) %212, i64 %211, !dbg !35 + %214 = icmp eq i32 %11, 0, !dbg !36 + %215 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %213, <1 x float> %201, i1 %214) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13) +!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0) +!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!13 = !DILocation(line: 35, column: 25, scope: !11) +!14 = !DILocation(line: 21, column: 28, scope: !5) +!15 = !DILocation(line: 21, column: 33, scope: !5) +!16 = !DILocation(line: 22, column: 23, scope: !5) +!17 = !DILocation(line: 29, column: 25, scope: !5) +!18 = !DILocation(line: 31, column: 47, scope: !5) +!19 = !DILocation(line: 31, column: 40, scope: !5) +!20 = !DILocation(line: 31, column: 34, scope: !5) +!21 = !DILocation(line: 31, column: 53, scope: !5) +!22 = !DILocation(line: 33, column: 23, scope: !5) +!23 = !DILocation(line: 34, column: 38, scope: !5) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bd9dd69cfa519de6efe25f21d9290d1f30494411 --- /dev/null +++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx @@ -0,0 +1,653 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<30>; + .reg .b32 %r<112>; + .reg .f32 %f<76>; + .reg .b64 %rd<22>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_0]; + ld.param.u64 %rd9, [triton__0d1d2d3de4e_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r48, %tid.x; + and.b32 %r49, %r48, 31; + ld.param.u64 %rd10, [triton__0d1d2d3de4e_param_2]; + and.b32 %r50, %r48, 7; + shl.b32 %r51, %r50, 2; + .loc 1 24 33 + bfe.u32 %r52, %r48, 5, 3; + bfe.u32 %r53, %r48, 3, 2; + shl.b32 %r54, %r52, 2; + or.b32 %r55, %r54, %r53; + or.b32 %r56, %r55, 96; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 21 33 + shl.b32 %r57, %r1, 5; + .loc 1 22 23 + or.b32 %r58, %r57, %r51; + or.b32 %r59, %r57, %r49; + .loc 1 29 25 + setp.lt.u32 %p16, %r56, 120; + .loc 1 31 47 + shl.b32 %r60, %r55, 17; + shl.b32 %r61, %r56, 17; + .loc 1 31 40 + add.s32 %r62, %r58, %r60; + add.s32 %r63, %r62, 4194304; + add.s32 %r64, %r62, 8388608; + add.s32 %r65, %r58, %r61; + .loc 1 31 34 + mul.wide.s32 %rd11, %r62, 4; + add.s64 %rd1, %rd8, %rd11; + mul.wide.s32 %rd12, %r63, 4; + add.s64 %rd2, %rd8, %rd12; + mul.wide.s32 %rd13, %r64, 4; + add.s64 %rd3, %rd8, %rd13; + mul.wide.s32 %rd14, %r65, 4; + add.s64 %rd4, %rd8, %rd14; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 31 53 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + @!%p1 mov.u32 %r12, %r6; + @!%p1 mov.u32 %r13, %r6; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + @!%p1 mov.u32 %r20, %r6; + @!%p1 mov.u32 %r21, %r6; + mov.b32 %f9, %r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p16 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p16 mov.u32 %r26, %r6; + @!%p16 mov.u32 %r27, %r6; + @!%p16 mov.u32 %r28, %r6; + @!%p16 mov.u32 %r29, %r6; + mov.b32 %f13, %r26; + mov.b32 %f14, %r27; + mov.b32 %f15, %r28; + mov.b32 %f16, %r29; + .loc 1 33 23 + add.f32 %f17, %f1, 0f00000000; + add.f32 %f18, %f2, 0f00000000; + add.f32 %f19, %f3, 0f00000000; + add.f32 %f20, %f4, 0f00000000; + add.f32 %f21, %f5, 0f00000000; + add.f32 %f22, %f6, 0f00000000; + add.f32 %f23, %f7, 0f00000000; + add.f32 %f24, %f8, 0f00000000; + add.f32 %f25, %f9, 0f00000000; + add.f32 %f26, %f10, 0f00000000; + add.f32 %f27, %f11, 0f00000000; + add.f32 %f28, %f12, 0f00000000; + add.f32 %f29, %f13, 0f00000000; + add.f32 %f30, %f14, 0f00000000; + add.f32 %f31, %f15, 0f00000000; + add.f32 %f32, %f16, 0f00000000; + .loc 1 34 38 + selp.f32 %f33, %f29, 0f00000000, %p16; + selp.f32 %f34, %f30, 0f00000000, %p16; + selp.f32 %f35, %f31, 0f00000000, %p16; + selp.f32 %f36, %f32, 0f00000000, %p16; +$L__tmp1: + .loc 2 233 15 + add.f32 %f37, %f17, %f21; + add.f32 %f38, %f18, %f22; + add.f32 %f39, %f19, %f23; + add.f32 %f40, %f20, %f24; + add.f32 %f41, %f37, %f25; + add.f32 %f42, %f38, %f26; + add.f32 %f43, %f39, %f27; + add.f32 %f44, %f40, %f28; + add.f32 %f45, %f41, %f33; + add.f32 %f46, %f42, %f34; + add.f32 %f47, %f43, %f35; + add.f32 %f48, %f44, %f36; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r66, %f45; + shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1; + mov.b32 %f49, %r67; +$L__tmp3: + .loc 2 233 15 + add.f32 %f50, %f45, %f49; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r68, %f50; + shfl.sync.bfly.b32 %r69, %r68, 8, 31, -1; + mov.b32 %f51, %r69; +$L__tmp5: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r70, %f46; + shfl.sync.bfly.b32 %r71, %r70, 16, 31, -1; + mov.b32 %f53, %r71; +$L__tmp7: + .loc 2 233 15 + add.f32 %f54, %f46, %f53; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r72, %f54; + shfl.sync.bfly.b32 %r73, %r72, 8, 31, -1; + mov.b32 %f55, %r73; +$L__tmp9: + .loc 2 233 15 + add.f32 %f56, %f54, %f55; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r74, %f47; + shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1; + mov.b32 %f57, %r75; +$L__tmp11: + .loc 2 233 15 + add.f32 %f58, %f47, %f57; +$L__tmp12: + .loc 2 243 36 + mov.b32 %r76, %f58; + shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1; + mov.b32 %f59, %r77; +$L__tmp13: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r78, %f48; + shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1; + mov.b32 %f61, %r79; +$L__tmp15: + .loc 2 233 15 + add.f32 %f62, %f48, %f61; +$L__tmp16: + .loc 2 243 36 + mov.b32 %r80, %f62; + shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1; + mov.b32 %f63, %r81; +$L__tmp17: + .loc 2 233 15 + add.f32 %f64, %f62, %f63; +$L__tmp18: + .loc 2 243 36 + setp.lt.u32 %p21, %r49, 8; + shl.b32 %r82, %r50, 7; + or.b32 %r83, %r82, %r54; + mov.u32 %r84, global_smem; + add.s32 %r34, %r84, %r83; + mov.b32 %r35, %f52; + @%p21 st.shared.b32 [ %r34 + 0 ], %r35; + or.b32 %r85, %r82, 32; + or.b32 %r86, %r85, %r54; + add.s32 %r36, %r84, %r86; + mov.b32 %r37, %f56; + @%p21 st.shared.b32 [ %r36 + 0 ], %r37; + or.b32 %r87, %r82, 64; + or.b32 %r88, %r87, %r54; + add.s32 %r38, %r84, %r88; + mov.b32 %r39, %f60; + @%p21 st.shared.b32 [ %r38 + 0 ], %r39; + or.b32 %r89, %r82, 96; + or.b32 %r90, %r89, %r54; + add.s32 %r40, %r84, %r90; + mov.b32 %r41, %f64; + @%p21 st.shared.b32 [ %r40 + 0 ], %r41; + bar.sync 0; + setp.lt.s32 %p25, %r48, 256; + shl.b32 %r91, %r48, 2; + add.s32 %r43, %r84, %r91; + @%p25 ld.shared.b32 %r42, [ %r43 + 0 ]; + mov.b32 %f65, %r42; + shfl.sync.bfly.b32 %r92, %r42, 4, 31, -1; + mov.b32 %f66, %r92; +$L__tmp19: + .loc 2 233 15 + add.f32 %f67, %f65, %f66; +$L__tmp20: + .loc 2 243 36 + mov.b32 %r93, %f67; + shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1; + mov.b32 %f68, %r94; +$L__tmp21: + .loc 2 233 15 + add.f32 %f69, %f67, %f68; +$L__tmp22: + .loc 2 243 36 + mov.b32 %r95, %f69; + shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1; + mov.b32 %f70, %r96; +$L__tmp23: + .loc 2 233 15 + add.f32 %f71, %f69, %f70; +$L__tmp24: + .loc 2 243 36 + setp.eq.s32 %p29, %r50, 0; + and.pred %p26, %p25, %p29; + mov.b32 %r45, %f71; + @%p26 st.shared.b32 [ %r43 + 0 ], %r45; + bar.sync 0; + add.s32 %r97, %r84, %r82; + ld.shared.f32 %f72, [%r97]; + add.s32 %r98, %r84, %r85; + ld.shared.f32 %f73, [%r98]; + add.s32 %r99, %r84, %r87; + ld.shared.f32 %f74, [%r99]; + add.s32 %r100, %r84, %r89; + ld.shared.f32 %f75, [%r100]; +$L__tmp25: + .loc 1 35 28 + bar.sync 0; + shl.b32 %r101, %r50, 4; + add.s32 %r102, %r84, %r101; + st.shared.f32 [%r102], %f72; + st.shared.f32 [%r102+4], %f73; + st.shared.f32 [%r102+8], %f74; + st.shared.f32 [%r102+12], %f75; + bar.sync 0; + shl.b32 %r103, %r49, 2; + add.s32 %r104, %r84, %r103; + .loc 1 36 20 + shr.s32 %r106, %r59, 31; + shr.u32 %r107, %r106, 24; + add.s32 %r108, %r59, %r107; + shr.s32 %r109, %r108, 8; + and.b32 %r110, %r108, -256; + sub.s32 %r111, %r59, %r110; + .loc 1 38 30 + mul.wide.s32 %rd15, %r109, 8; + add.s64 %rd6, %rd9, %rd15; + .loc 1 45 55 + ld.shared.u32 %r47, [%r104]; + .loc 1 38 35 + mov.u64 %rd5, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd5 }, [ %rd6 + 0 ]; + .loc 1 41 32 + shr.u64 %rd16, %rd5, 54; + and.b64 %rd17, %rd16, 512; + add.s64 %rd18, %rd17, %rd5; + .loc 1 45 30 + shl.b64 %rd19, %rd18, 10; + add.s64 %rd20, %rd10, %rd19; + mul.wide.s32 %rd21, %r111, 4; + add.s64 %rd7, %rd20, %rd21; + .loc 1 45 55 + setp.eq.s32 %p28, %r52, 0; + mov.u32 %r46, 0x0; + @%p28 atom.global.gpu.acq_rel.add.f32 %r46, [ %rd7 + 0 ], %r47; + .loc 1 45 4 + ret; +$L__tmp26: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp25 +.b8 2 +.b8 35 +.b8 25 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fd110b3da99f6cd0f9c7f8441eaa1182137be8b3 --- /dev/null +++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir @@ -0,0 +1,60 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<32x1xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<32x1xi64, #blocked> + %cst_1 = arith.constant dense<512> : tensor<32x1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<32x1xi32, #blocked> + %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1> + %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1> + %cst_5 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1> + %cst_6 = arith.constant dense : tensor<32x1xi1, #blocked> + %c32_i32 = arith.constant 32 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c32_i32 : i32 + %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32x1xi32, #blocked1> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xi32, #blocked> + %6 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked1> + %7 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked> + %8 = arith.addi %6, %4 : tensor<32x1xi32, #blocked1> + %9 = arith.addi %7, %5 : tensor<32x1xi32, #blocked> + %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1> + %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1> + %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1> + %14 = tt.broadcast %8 : (tensor<32x1xi32, #blocked1>) -> tensor<32x128xi32, #blocked1> + %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<32x128xi32, #blocked1> + %16 = arith.addi %14, %15 : tensor<32x128xi32, #blocked1> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<32x128x!tt.ptr, #blocked1> + %18 = tt.addptr %17, %16 : tensor<32x128x!tt.ptr, #blocked1>, tensor<32x128xi32, #blocked1> + %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<32x128xi1, #blocked1> + %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32, #blocked1> + %21 = arith.addf %20, %cst_5 : tensor<32x128xf32, #blocked1> + %22 = arith.select %19, %21, %cst_5 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1> + %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %40 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<32x128xf32, #blocked1>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %24 = triton_gpu.convert_layout %23 : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xf32, #blocked> + %26 = arith.divsi %9, %cst_2 : tensor<32x1xi32, #blocked> + %27 = arith.remsi %9, %cst_2 : tensor<32x1xi32, #blocked> + %28 = tt.splat %arg1 : (!tt.ptr) -> tensor<32x1x!tt.ptr, #blocked> + %29 = tt.addptr %28, %26 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64, #blocked> + %31 = arith.addi %30, %cst_1 : tensor<32x1xi64, #blocked> + %32 = arith.cmpi slt, %30, %cst_0 : tensor<32x1xi64, #blocked> + %33 = arith.select %32, %31, %30 : tensor<32x1xi1, #blocked>, tensor<32x1xi64, #blocked> + %34 = arith.muli %33, %cst : tensor<32x1xi64, #blocked> + %35 = arith.extsi %27 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> + %36 = arith.addi %35, %34 : tensor<32x1xi64, #blocked> + %37 = tt.splat %arg2 : (!tt.ptr) -> tensor<32x1x!tt.ptr, #blocked> + %38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi64, #blocked> + %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xf32, #blocked>, tensor<32x1xi1, #blocked>) -> tensor<32x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c07a6e0066a7250123e6f73cc8f95e6913969c8c --- /dev/null +++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir @@ -0,0 +1,53 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<32x1xi64> + %cst_0 = arith.constant dense<0> : tensor<32x1xi64> + %cst_1 = arith.constant dense<512> : tensor<32x1xi64> + %cst_2 = arith.constant dense : tensor<32x1xi1> + %cst_3 = arith.constant dense<256> : tensor<32x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x128xi32> + %cst_5 = arith.constant dense<120> : tensor<1x128xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> + %c32_i32 = arith.constant 32 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c32_i32 : i32 + %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<32x1xi32> + %5 = arith.addi %4, %3 : tensor<32x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32> + %9 = arith.muli %7, %cst_4 : tensor<1x128xi32> + %10 = tt.broadcast %5 : (tensor<32x1xi32>) -> tensor<32x128xi32> + %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<32x128xi32> + %12 = arith.addi %10, %11 : tensor<32x128xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<32x128x!tt.ptr> + %14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> + %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<32x128xi1> + %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32> + %17 = arith.addf %16, %cst_6 : tensor<32x128xf32> + %18 = arith.select %15, %17, %cst_6 : tensor<32x128xi1>, tensor<32x128xf32> + %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %35 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %35 : f32 + }) : (tensor<32x128xf32>) -> tensor<32xf32> + %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<32xf32>) -> tensor<32x1xf32> + %21 = arith.divsi %5, %cst_3 : tensor<32x1xi32> + %22 = arith.remsi %5, %cst_3 : tensor<32x1xi32> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<32x1x!tt.ptr> + %24 = tt.addptr %23, %21 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> + %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64> + %26 = arith.addi %25, %cst_1 : tensor<32x1xi64> + %27 = arith.cmpi slt, %25, %cst_0 : tensor<32x1xi64> + %28 = arith.select %27, %26, %25 : tensor<32x1xi1>, tensor<32x1xi64> + %29 = arith.muli %28, %cst : tensor<32x1xi64> + %30 = arith.extsi %22 : tensor<32x1xi32> to tensor<32x1xi64> + %31 = arith.addi %30, %29 : tensor<32x1xi64> + %32 = tt.splat %arg2 : (!tt.ptr) -> tensor<32x1x!tt.ptr> + %33 = tt.addptr %32, %31 : tensor<32x1x!tt.ptr>, tensor<32x1xi64> + %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr>, tensor<32x1xf32>, tensor<32x1xi1>) -> tensor<32x1xf32> + tt.return + } +} diff --git a/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8dfc219ffbb3021e19f3b35e9be96086e23c9c4b --- /dev/null +++ b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir @@ -0,0 +1,24 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1> + %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1> + %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked> + %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1> + %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1> + %13 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked1> + %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr, #blocked1>, tensor<1024xi32, #blocked1> + tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..99cbb9a04781180a9c95f69e97cd58ef5f81890f Binary files /dev/null and b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin differ diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d72a62515fa876a92b19f5e1298f1a6c96bc6045 --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir @@ -0,0 +1,63 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c8_i32 = arith.constant 8 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked> + %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked> + %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %18 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 { + %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked> + %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked> + %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked> + %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked> + %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked> + %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked> + %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %34 = tt.load %32, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %35 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %36 = tt.load %35, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %37 = arith.mulf %34, %36 : tensor<64x8xf32, #blocked> + %38 = arith.addf %arg6, %37 : tensor<64x8xf32, #blocked> + %39 = arith.select %33, %38, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + scf.yield %39 : tensor<64x8xf32, #blocked> + } + %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %25 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %25 : f32 + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> + %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..185a499db8070473dada441e0fe7d45e7bd06602 --- /dev/null +++ b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx @@ -0,0 +1,577 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4de( + .param .u64 triton__0d1d2d3de4de_param_0, + .param .u64 triton__0d1d2d3de4de_param_1, + .param .u64 triton__0d1d2d3de4de_param_2, + .param .u32 triton__0d1d2d3de4de_param_3, + .param .u32 triton__0d1d2d3de4de_param_4 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<20>; + .reg .b16 %rs<5>; + .reg .b32 %r<98>; + .reg .f32 %f<47>; + .reg .b64 %rd<10>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2]; + ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1]; + ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shl.b32 %r13, %r1, 2; + and.b32 %r3, %r13, 60; + .loc 1 24 33 + bfe.u32 %r4, %r1, 5, 2; + .loc 1 21 28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 + shl.b32 %r5, %r11, 6; + .loc 1 22 23 + or.b32 %r14, %r5, %r3; + .loc 1 26 20 + shr.s32 %r16, %r14, 31; + shr.u32 %r17, %r16, 24; + add.s32 %r18, %r14, %r17; + shr.s32 %r19, %r18, 8; + .loc 1 29 36 + mad.lo.s32 %r20, %r19, 32512, %r14; + shl.b32 %r21, %r4, 9; + add.s32 %r22, %r20, %r21; + shl.b32 %r23, %r1, 4; + and.b32 %r24, %r23, 256; + add.s32 %r96, %r22, %r24; + mov.f32 %f43, 0f00000000; + mov.b32 %r97, -8; + mov.pred %p1, -1; + mov.f32 %f44, %f43; + mov.f32 %f45, %f43; + mov.f32 %f46, %f43; +$L__BB0_1: + .loc 1 33 34 + mul.wide.s32 %rd6, %r96, 2; + add.s64 %rd4, %rd1, %rd6; + mov.b32 %r27, 0; + .loc 1 33 63 + mov.u32 %r25, 0x0; + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r25, %r27; + @!%p1 mov.u32 %r26, %r27; + cvt.u16.u32 %rs1, %r25; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; } + cvt.u16.u32 %rs3, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; } + .loc 1 33 115 + cvt.f32.bf16 %r29, %rs1; + mov.b32 %f13, %r29; + cvt.f32.bf16 %r30, %rs2; + mov.b32 %f14, %r30; + cvt.f32.bf16 %r31, %rs3; + mov.b32 %f15, %r31; + cvt.f32.bf16 %r32, %rs4; + mov.b32 %f16, %r32; + .loc 1 34 34 + mul.wide.s32 %rd7, %r96, 4; + add.s64 %rd5, %rd2, %rd7; + .loc 1 34 63 + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r33, %r27; + @!%p1 mov.u32 %r34, %r27; + @!%p1 mov.u32 %r35, %r27; + @!%p1 mov.u32 %r36, %r27; + mov.b32 %f17, %r33; + mov.b32 %f18, %r34; + mov.b32 %f19, %r35; + mov.b32 %f20, %r36; + .loc 1 39 38 + fma.rn.f32 %f46, %f16, %f20, %f46; + fma.rn.f32 %f45, %f15, %f19, %f45; + fma.rn.f32 %f44, %f14, %f18, %f44; + fma.rn.f32 %f43, %f13, %f17, %f43; + .loc 1 29 36 + add.s32 %r97, %r97, 8; + add.s32 %r96, %r96, 2048; + setp.lt.u32 %p9, %r97, 120; + @%p9 bra $L__BB0_1; + .loc 1 22 44 + and.b32 %r58, %r1, 63; + .loc 1 22 23 + or.b32 %r59, %r5, %r58; +$L__tmp1: + .loc 2 243 36 + mov.b32 %r60, %f43; + shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1; + mov.b32 %f21, %r61; +$L__tmp2: + .loc 2 233 15 + add.f32 %f22, %f43, %f21; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r62, %f44; + shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1; + mov.b32 %f23, %r63; +$L__tmp4: + .loc 2 233 15 + add.f32 %f24, %f44, %f23; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r64, %f45; + shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1; + mov.b32 %f25, %r65; +$L__tmp6: + .loc 2 233 15 + add.f32 %f26, %f45, %f25; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r66, %f46; + shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1; + mov.b32 %f27, %r67; +$L__tmp8: + .loc 2 233 15 + add.f32 %f28, %f46, %f27; +$L__tmp9: + .loc 2 243 36 + setp.lt.u32 %p10, %r2, 16; + shl.b32 %r68, %r3, 2; + or.b32 %r69, %r68, %r4; + shl.b32 %r70, %r69, 2; + mov.u32 %r71, global_smem; + add.s32 %r41, %r71, %r70; + mov.b32 %r42, %f22; + @%p10 st.shared.b32 [ %r41 + 0 ], %r42; + shl.b32 %r72, %r4, 2; + shl.b32 %r73, %r3, 4; + or.b32 %r74, %r73, 16; + or.b32 %r75, %r74, %r72; + add.s32 %r43, %r71, %r75; + mov.b32 %r44, %f24; + @%p10 st.shared.b32 [ %r43 + 0 ], %r44; + or.b32 %r76, %r73, 32; + or.b32 %r77, %r76, %r72; + add.s32 %r45, %r71, %r77; + mov.b32 %r46, %f26; + @%p10 st.shared.b32 [ %r45 + 0 ], %r46; + or.b32 %r78, %r73, 48; + or.b32 %r79, %r78, %r72; + add.s32 %r47, %r71, %r79; + mov.b32 %r48, %f28; + @%p10 st.shared.b32 [ %r47 + 0 ], %r48; + bar.sync 0; + setp.lt.s32 %p14, %r1, 256; + add.s32 %r50, %r71, %r13; + @%p14 ld.shared.b32 %r49, [ %r50 + 0 ]; + mov.b32 %f29, %r49; + shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1; + mov.b32 %f30, %r81; +$L__tmp10: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r82, %f31; + shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1; + mov.b32 %f32, %r83; +$L__tmp12: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp13: + .loc 2 243 36 + and.b32 %r84, %r1, 3; + setp.eq.s32 %p19, %r84, 0; + and.pred %p15, %p14, %p19; + mov.b32 %r52, %f33; + @%p15 st.shared.b32 [ %r50 + 0 ], %r52; + add.s32 %r54, %r50, 512; + @%p14 ld.shared.b32 %r53, [ %r54 + 0 ]; + mov.b32 %f34, %r53; + shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1; + mov.b32 %f35, %r85; +$L__tmp14: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r86, %f36; + shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1; + mov.b32 %f37, %r87; +$L__tmp16: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r56, %f38; + @%p15 st.shared.b32 [ %r54 + 0 ], %r56; + bar.sync 0; + add.s32 %r88, %r71, %r73; + ld.shared.f32 %f39, [%r88]; + add.s32 %r89, %r71, %r74; + ld.shared.f32 %f40, [%r89]; + add.s32 %r90, %r71, %r76; + ld.shared.f32 %f41, [%r90]; + add.s32 %r91, %r71, %r78; + ld.shared.f32 %f42, [%r91]; +$L__tmp18: + .loc 1 40 28 + bar.sync 0; + add.s32 %r92, %r71, %r68; + st.shared.f32 [%r92], %f39; + st.shared.f32 [%r92+4], %f40; + st.shared.f32 [%r92+8], %f41; + st.shared.f32 [%r92+12], %f42; + bar.sync 0; + shl.b32 %r93, %r58, 2; + add.s32 %r94, %r71, %r93; + ld.shared.u32 %r57, [%r94]; + .loc 1 41 25 + mul.wide.s32 %rd9, %r59, 4; + add.s64 %rd8, %rd3, %rd9; + .loc 1 41 36 + and.b32 %r95, %r1, 64; + setp.eq.s32 %p18, %r95, 0; + @%p18 st.global.b32 [ %rd8 + 0 ], { %r57 }; + .loc 1 41 4 + ret; +$L__tmp19: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 266 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 115 +.b8 106 +.b8 100 +.b8 55 +.b8 109 +.b8 108 +.b8 114 +.b8 106 +.b8 117 +.b8 106 +.b8 100 +.b8 52 +.b8 117 +.b8 119 +.b8 122 +.b8 101 +.b8 53 +.b8 116 +.b8 107 +.b8 103 +.b8 55 +.b8 112 +.b8 116 +.b8 116 +.b8 101 +.b8 97 +.b8 103 +.b8 112 +.b8 105 +.b8 104 +.b8 103 +.b8 116 +.b8 53 +.b8 122 +.b8 116 +.b8 97 +.b8 116 +.b8 102 +.b8 113 +.b8 99 +.b8 104 +.b8 112 +.b8 114 +.b8 99 +.b8 114 +.b8 97 +.b8 120 +.b8 50 +.b8 50 +.b8 108 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 115 +.b8 106 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp18 +.b8 2 +.b8 40 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 40 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 270 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 270 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..96f5375ad075533186e3f048679c44d977bf40d5 --- /dev/null +++ b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir @@ -0,0 +1,65 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c8_i32 = arith.constant 8 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked> + %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked> + %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %18 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 { + %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked> + %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked> + %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked> + %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked> + %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked> + %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked> + %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %34 = tt.load %32, %33, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %35 = arith.extf %34 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> + %36 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %37 = tt.load %36, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %38 = arith.mulf %35, %37 : tensor<64x8xf32, #blocked> + %39 = arith.addf %arg6, %38 : tensor<64x8xf32, #blocked> + %40 = arith.select %33, %39, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + scf.yield %40 : tensor<64x8xf32, #blocked> + } + %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %25 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %25 : f32 + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> + %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..cde3435bfb32afc2845f5eb14e7dbfd3b4ebe0c2 Binary files /dev/null and b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin differ diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7a6024757479d55932884faecaac8e7c2a0102b4 --- /dev/null +++ b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx @@ -0,0 +1,743 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7de8de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7de8de( + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6, + .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<37>; + .reg .b16 %rs<9>; + .reg .b32 %r<110>; + .reg .f32 %f<86>; + .reg .b64 %rd<26>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r76, %tid.x; + and.b32 %r77, %r76, 31; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2]; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3]; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4]; + shl.b32 %r78, %r76, 2; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5]; + and.b32 %r79, %r78, 252; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r80, %r1, 8; + .loc 1 30 36 + or.b32 %r81, %r80, %r79; + .loc 1 30 30 + mul.wide.s32 %rd22, %r81, 2; + add.s64 %rd1, %rd16, %rd22; + mov.b32 %r4, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r4; + @!%p1 mov.u32 %r3, %r4; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 30 67 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + .loc 1 31 30 + mul.wide.u32 %rd23, %r79, 4; + add.s64 %rd2, %rd17, %rd23; + .loc 1 31 35 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r4; + @!%p1 mov.u32 %r11, %r4; + @!%p1 mov.u32 %r12, %r4; + @!%p1 mov.u32 %r13, %r4; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 32 30 + mul.wide.s32 %rd24, %r81, 4; + add.s64 %rd3, %rd18, %rd24; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r4; + @!%p1 mov.u32 %r19, %r4; + @!%p1 mov.u32 %r20, %r4; + @!%p1 mov.u32 %r21, %r4; + mov.b32 %f9, %r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + .loc 1 33 30 + mul.wide.s32 %rd25, %r1, 4; + add.s64 %rd4, %rd19, %rd25; + .loc 1 33 35 + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ]; + mov.b32 %f13, %r26; + mov.u32 %r27, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ]; + mov.u32 %r28, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ]; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ]; + .loc 1 34 31 + add.s64 %rd8, %rd20, %rd25; + .loc 1 34 36 + mov.u32 %r55, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ]; + mov.b32 %f14, %r55; + mov.u32 %r31, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ]; + mov.u32 %r32, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ]; + mov.u32 %r33, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ]; + .loc 1 35 35 + add.s64 %rd12, %rd15, %rd24; + .loc 1 35 51 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ]; + @!%p1 mov.u32 %r34, %r4; + @!%p1 mov.u32 %r35, %r4; + @!%p1 mov.u32 %r36, %r4; + @!%p1 mov.u32 %r37, %r4; + mov.b32 %f15, %r34; + mov.b32 %f16, %r35; + mov.b32 %f17, %r36; + mov.b32 %f18, %r37; + .loc 1 37 18 + mul.f32 %f19, %f1, %f5; + mul.f32 %f20, %f2, %f6; + mul.f32 %f21, %f3, %f7; + mul.f32 %f22, %f4, %f8; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f23, %f1, %f5, %f20; + fma.rn.f32 %f24, %f3, %f7, %f23; + fma.rn.f32 %f25, %f4, %f8, %f24; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r82, %f25; + shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1; + mov.b32 %f26, %r83; +$L__tmp3: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r84, %f27; + shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1; + mov.b32 %f28, %r85; +$L__tmp5: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r86, %f29; + shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1; + mov.b32 %f30, %r87; +$L__tmp7: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r88, %f31; + shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1; + mov.b32 %f32, %r89; +$L__tmp9: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r90, %f33; + shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1; + mov.b32 %f34, %r91; +$L__tmp11: + .loc 2 233 15 + add.f32 %f35, %f33, %f34; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p27, %r77, 0; + shr.u32 %r92, %r76, 3; + and.b32 %r93, %r92, 4; + mov.u32 %r94, global_smem; + add.s32 %r42, %r94, %r93; + mov.b32 %r43, %f35; + @%p27 st.shared.b32 [ %r42 + 0 ], %r43; + bar.sync 0; + setp.lt.s32 %p28, %r76, 2; + add.s32 %r45, %r94, %r78; + @%p28 ld.shared.b32 %r44, [ %r45 + 0 ]; + mov.b32 %f36, %r44; + shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1; + mov.b32 %f37, %r95; +$L__tmp13: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp14: + .loc 2 243 36 + and.b32 %r96, %r76, 1; + setp.eq.b32 %p35, %r96, 1; + not.pred %p36, %p35; + and.pred %p29, %p28, %p36; + mov.b32 %r47, %f38; + @%p29 st.shared.b32 [ %r45 + 0 ], %r47; + bar.sync 0; + ld.shared.f32 %f39, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f40, %f39, 0f00000000; +$L__tmp16: + .loc 1 41 19 + sub.f32 %f41, %f9, %f13; + sub.f32 %f42, %f10, %f13; + sub.f32 %f43, %f11, %f13; + sub.f32 %f44, %f12, %f13; + .loc 1 42 20 + mul.f32 %f45, %f41, %f14; + mul.f32 %f46, %f42, %f14; + mul.f32 %f47, %f43, %f14; + mul.f32 %f48, %f44, %f14; + .loc 1 43 19 + mul.f32 %f49, %f20, %f46; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f50, %f19, %f45, %f49; + fma.rn.f32 %f51, %f21, %f47, %f50; + fma.rn.f32 %f52, %f22, %f48, %f51; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r97, %f52; + shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1; + mov.b32 %f53, %r98; +$L__tmp20: + .loc 2 233 15 + add.f32 %f54, %f52, %f53; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r99, %f54; + shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1; + mov.b32 %f55, %r100; +$L__tmp22: + .loc 2 233 15 + add.f32 %f56, %f54, %f55; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r101, %f56; + shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1; + mov.b32 %f57, %r102; +$L__tmp24: + .loc 2 233 15 + add.f32 %f58, %f56, %f57; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r103, %f58; + shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1; + mov.b32 %f59, %r104; +$L__tmp26: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r105, %f60; + shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1; + mov.b32 %f61, %r106; +$L__tmp28: + .loc 2 233 15 + add.f32 %f62, %f60, %f61; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r49, %f62; + @%p27 st.shared.b32 [ %r42 + 0 ], %r49; + bar.sync 0; + @%p28 ld.shared.b32 %r50, [ %r45 + 0 ]; + mov.b32 %f63, %r50; + shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1; + mov.b32 %f64, %r107; +$L__tmp30: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r53, %f65; + @%p29 st.shared.b32 [ %r45 + 0 ], %r53; + bar.sync 0; + ld.shared.f32 %f66, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f67, %f66, 0f00000000; + mov.b32 %r56, 1132462080; +$L__tmp33: + .loc 1 48 20 + div.full.f32 %r54, %r55, %r56; + mov.b32 %f68, %r54; + .loc 1 50 20 + neg.f32 %f69, %f40; + fma.rn.f32 %f70, %f19, 0f43800000, %f69; + fma.rn.f32 %f71, %f20, 0f43800000, %f69; + fma.rn.f32 %f72, %f21, 0f43800000, %f69; + fma.rn.f32 %f73, %f22, 0f43800000, %f69; + .loc 1 52 20 + neg.f32 %f74, %f45; + fma.rn.f32 %f75, %f74, %f67, %f70; + neg.f32 %f76, %f46; + fma.rn.f32 %f77, %f76, %f67, %f71; + neg.f32 %f78, %f47; + fma.rn.f32 %f79, %f78, %f67, %f72; + neg.f32 %f80, %f48; + fma.rn.f32 %f81, %f80, %f67, %f73; + .loc 1 54 20 + fma.rn.f32 %f82, %f68, %f75, %f15; + fma.rn.f32 %f83, %f68, %f77, %f16; + fma.rn.f32 %f84, %f68, %f79, %f17; + fma.rn.f32 %f85, %f68, %f81, %f18; + .loc 1 56 51 + mov.b32 %r66, %f82; + mov.b32 %r67, %f83; + mov.b32 %r68, %f84; + mov.b32 %r69, %f85; + @%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 }; + .loc 1 57 25 + add.s64 %rd14, %rd21, %rd22; + .loc 1 57 48 + cvt.rn.bf16.f32 %rs5, %r66; + cvt.rn.bf16.f32 %rs6, %r67; + cvt.rn.bf16.f32 %rs7, %r68; + cvt.rn.bf16.f32 %rs8, %r69; + mov.b32 %r108, {%rs5, %rs6}; + mov.b32 %r109, {%rs7, %rs8}; + @%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 }; + .loc 1 57 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 403 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 115 +.b8 110 +.b8 101 +.b8 100 +.b8 52 +.b8 104 +.b8 121 +.b8 120 +.b8 112 +.b8 103 +.b8 119 +.b8 117 +.b8 53 +.b8 116 +.b8 116 +.b8 117 +.b8 98 +.b8 115 +.b8 51 +.b8 114 +.b8 55 +.b8 117 +.b8 120 +.b8 107 +.b8 106 +.b8 113 +.b8 53 +.b8 121 +.b8 102 +.b8 108 +.b8 51 +.b8 122 +.b8 104 +.b8 54 +.b8 99 +.b8 50 +.b8 115 +.b8 111 +.b8 122 +.b8 111 +.b8 98 +.b8 116 +.b8 107 +.b8 101 +.b8 107 +.b8 50 +.b8 117 +.b8 122 +.b8 102 +.b8 99 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 115 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 40 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 40 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 40 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 46 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 46 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 46 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 407 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 407 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0f80420eebcc22a5a99ebb4cc6417788c7064252 --- /dev/null +++ b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir @@ -0,0 +1,72 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %22 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %24 = tt.load %23, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %25 = arith.mulf %9, %12 : tensor<256xf32> + %26 = arith.select %2, %25, %cst_1 : tensor<256xi1>, tensor<256xf32> + %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %50 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %50 : f32 + }) : (tensor<256xf32>) -> f32 + %28 = arith.addf %27, %cst_0 : f32 + %29 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32> + %30 = arith.subf %15, %29 : tensor<256xf32> + %31 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32> + %32 = arith.mulf %30, %31 : tensor<256xf32> + %33 = arith.mulf %25, %32 : tensor<256xf32> + %34 = arith.select %2, %33, %cst_1 : tensor<256xi1>, tensor<256xf32> + %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %50 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %50 : f32 + }) : (tensor<256xf32>) -> f32 + %36 = arith.addf %35, %cst_0 : f32 + %37 = arith.divf %21, %cst_3 : tensor<1xf32> + %38 = arith.mulf %25, %cst_2 : tensor<256xf32> + %39 = tt.splat %28 : (f32) -> tensor<256xf32> + %40 = arith.subf %38, %39 : tensor<256xf32> + %41 = tt.splat %36 : (f32) -> tensor<256xf32> + %42 = arith.mulf %32, %41 : tensor<256xf32> + %43 = arith.subf %40, %42 : tensor<256xf32> + %44 = tt.broadcast %37 : (tensor<1xf32>) -> tensor<256xf32> + %45 = arith.mulf %44, %43 : tensor<256xf32> + %46 = arith.addf %24, %45 : tensor<256xf32> + tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %47 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr> + %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %49 = arith.truncf %46 : tensor<256xf32> to tensor<256xbf16> + tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ee5cb4d94b8ddb9d24c82aa0b55069d117ccb35f Binary files /dev/null and b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin differ diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..56acceca79e636922fce5162d7e68c9afe3f1b30 --- /dev/null +++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir @@ -0,0 +1,132 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 63, !dbg !8 + %8 = lshr i32 %6, 6, !dbg !9 + %9 = and i32 %8, 3, !dbg !9 + %10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %11 = shl i32 %10, 6, !dbg !11 + %12 = or i32 %11, %7, !dbg !12 + br label %13, !dbg !13 + +13: ; preds = %5, %13 + %14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ] + %15 = phi i32 [ 0, %5 ], [ %24, %13 ] + %16 = or i32 %15, %9, !dbg !14 + %17 = shl i32 %16, 17, !dbg !15 + %18 = add i32 %17, %12, !dbg !16 + %19 = sext i32 %18 to i64, !dbg !17 + %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17 + %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18 + %22 = bitcast i32 %21 to float, !dbg !18 + %23 = fadd float %14, %22, !dbg !19 + %24 = add nuw nsw i32 %15, 4, !dbg !13 + %25 = icmp ult i32 %15, 116, !dbg !13 + br i1 %25, label %13, label %26, !dbg !13 + +26: ; preds = %13 + %27 = shl nuw nsw i32 %7, 2, !dbg !20 + %28 = or i32 %27, %9, !dbg !20 + %29 = zext nneg i32 %28 to i64, !dbg !20 + %30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %31 = icmp slt i32 %6, 256, !dbg !20 + %32 = sext i32 %6 to i64, !dbg !20 + %33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20 + %34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20 + %35 = bitcast float %34 to i32, !dbg !20 + %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20 + %37 = bitcast i32 %36 to float, !dbg !20 + %38 = fadd float %34, %37, !dbg !24 + %39 = bitcast float %38 to i32, !dbg !20 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20 + %41 = bitcast i32 %40 to float, !dbg !20 + %42 = fadd float %38, %41, !dbg !24 + %43 = and i32 %6, 3, !dbg !20 + %44 = icmp eq i32 %43, 0, !dbg !20 + %45 = and i1 %31, %44, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %46 = zext nneg i32 %27 to i64, !dbg !20 + %47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20 + %48 = load float, ptr addrspace(3) %47, align 4, !dbg !20 + %.frozen = freeze i32 %12 + %49 = sdiv i32 %.frozen, 256, !dbg !28 + %50 = mul i32 %49, 256 + %.decomposed = sub i32 %.frozen, %50 + %51 = sext i32 %49 to i64, !dbg !29 + %52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30 + %54 = lshr i64 %53, 54, !dbg !31 + %55 = and i64 %54, 512, !dbg !31 + %56 = add i64 %55, %53, !dbg !31 + %57 = shl i64 %56, 8, !dbg !32 + %58 = sext i32 %.decomposed to i64, !dbg !33 + %59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34 + %60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34 + %61 = icmp eq i32 %9, 0, !dbg !35 + %62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35 + %63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 27, column: 36, scope: !5) +!14 = !DILocation(line: 28, column: 27, scope: !5) +!15 = !DILocation(line: 31, column: 47, scope: !5) +!16 = !DILocation(line: 31, column: 40, scope: !5) +!17 = !DILocation(line: 31, column: 34, scope: !5) +!18 = !DILocation(line: 31, column: 53, scope: !5) +!19 = !DILocation(line: 34, column: 38, scope: !5) +!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!23 = !DILocation(line: 35, column: 25, scope: !21) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 36, column: 20, scope: !5) +!29 = !DILocation(line: 38, column: 30, scope: !5) +!30 = !DILocation(line: 38, column: 35, scope: !5) +!31 = !DILocation(line: 41, column: 32, scope: !5) +!32 = !DILocation(line: 45, column: 40, scope: !5) +!33 = !DILocation(line: 45, column: 36, scope: !5) +!34 = !DILocation(line: 45, column: 30, scope: !5) +!35 = !DILocation(line: 45, column: 55, scope: !5) +!36 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..09772a987690dd4145d83a5a48414fad98be08dc --- /dev/null +++ b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir @@ -0,0 +1,304 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = and i32 %8, 31, !dbg !10 + %10 = lshr i32 %8, 5, !dbg !10 + %11 = and i32 %10, 1, !dbg !10 + %urem = shl i32 %8, 2, !dbg !10 + %12 = and i32 %urem, 252, !dbg !10 + %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %14 = shl i32 %13, 8, !dbg !12 + %15 = or i32 %14, %12, !dbg !13 + %16 = sext i32 %15 to i64, !dbg !14 + %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14 + %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15 + %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15 + %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15 + %23 = bitcast i32 %21 to float, !dbg !15 + %24 = bitcast i32 %22 to float, !dbg !15 + %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16 + %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %27 = extractvalue { i32, i32 } %26, 0, !dbg !17 + %28 = extractvalue { i32, i32 } %26, 1, !dbg !17 + %29 = trunc i32 %27 to i16, !dbg !17 + %extelt.offset = lshr i32 %27, 16, !dbg !17 + %30 = trunc i32 %extelt.offset to i16, !dbg !17 + %31 = trunc i32 %28 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %28, 16, !dbg !17 + %32 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 + %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19 + %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %39 = extractvalue { i32, i32 } %38, 0, !dbg !20 + %40 = extractvalue { i32, i32 } %38, 1, !dbg !20 + %41 = trunc i32 %39 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %39, 16, !dbg !20 + %42 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %43 = trunc i32 %40 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %40, 16, !dbg !20 + %44 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21 + %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21 + %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21 + %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21 + %49 = zext nneg i32 %12 to i64, !dbg !22 + %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22 + %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 + %52 = fadd float %35, %23, !dbg !24 + %53 = fadd float %36, %24, !dbg !24 + %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15 + %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15 + %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15 + %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24 + %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24 + %59 = fadd <2 x float> %58, %56, !dbg !24 + %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25 + %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25 + %62 = fadd <2 x float> %59, %61, !dbg !25 + %63 = fadd float %52, %47, !dbg !25 + %64 = fadd float %53, %48, !dbg !25 + %65 = extractelement <2 x float> %62, i64 0, !dbg !26 + %66 = extractelement <2 x float> %62, i64 1, !dbg !26 + %67 = fadd float %65, %66, !dbg !26 + %68 = fadd float %67, %63, !dbg !26 + %69 = fadd float %68, %64, !dbg !26 + %70 = bitcast float %69 to i32, !dbg !32 + %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32 + %72 = bitcast i32 %71 to float, !dbg !32 + %73 = fadd float %69, %72, !dbg !26 + %74 = bitcast float %73 to i32, !dbg !32 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32 + %76 = bitcast i32 %75 to float, !dbg !32 + %77 = fadd float %73, %76, !dbg !26 + %78 = bitcast float %77 to i32, !dbg !32 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32 + %80 = bitcast i32 %79 to float, !dbg !32 + %81 = fadd float %77, %80, !dbg !26 + %82 = bitcast float %81 to i32, !dbg !32 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32 + %84 = bitcast i32 %83 to float, !dbg !32 + %85 = fadd float %81, %84, !dbg !26 + %86 = bitcast float %85 to i32, !dbg !32 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32 + %88 = bitcast i32 %87 to float, !dbg !32 + %89 = fadd float %85, %88, !dbg !26 + %90 = icmp eq i32 %9, 0, !dbg !32 + %91 = zext nneg i32 %11 to i64, !dbg !32 + %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %93 = icmp slt i32 %8, 2, !dbg !32 + %94 = sext i32 %8 to i64, !dbg !32 + %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32 + %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32 + %97 = bitcast float %96 to i32, !dbg !32 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32 + %99 = bitcast i32 %98 to float, !dbg !32 + %100 = fadd float %96, %99, !dbg !26 + %101 = and i32 %8, 1, !dbg !32 + %102 = icmp eq i32 %101, 0, !dbg !32 + %103 = and i1 %93, %102, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32 + %105 = fadd float %104, 0.000000e+00, !dbg !34 + %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38 + %107 = fsub float %65, %106, !dbg !39 + %108 = fsub float %66, %106, !dbg !39 + %109 = fsub float %63, %106, !dbg !39 + %110 = fsub float %64, %106, !dbg !39 + %111 = fmul float %107, %107, !dbg !40 + %112 = fmul float %108, %108, !dbg !40 + %113 = fmul float %109, %109, !dbg !40 + %114 = fmul float %110, %110, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %115 = fadd float %111, %112, !dbg !43 + %116 = fadd float %113, %115, !dbg !43 + %117 = fadd float %114, %116, !dbg !43 + %118 = bitcast float %117 to i32, !dbg !41 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41 + %120 = bitcast i32 %119 to float, !dbg !41 + %121 = fadd float %117, %120, !dbg !43 + %122 = bitcast float %121 to i32, !dbg !41 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41 + %124 = bitcast i32 %123 to float, !dbg !41 + %125 = fadd float %121, %124, !dbg !43 + %126 = bitcast float %125 to i32, !dbg !41 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41 + %128 = bitcast i32 %127 to float, !dbg !41 + %129 = fadd float %125, %128, !dbg !43 + %130 = bitcast float %129 to i32, !dbg !41 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41 + %132 = bitcast i32 %131 to float, !dbg !41 + %133 = fadd float %129, %132, !dbg !43 + %134 = bitcast float %133 to i32, !dbg !41 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41 + %136 = bitcast i32 %135 to float, !dbg !41 + %137 = fadd float %133, %136, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41 + %139 = bitcast float %138 to i32, !dbg !41 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41 + %141 = bitcast i32 %140 to float, !dbg !41 + %142 = fadd float %138, %141, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41 + %144 = fadd float %143, 0.000000e+00, !dbg !46 + %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48 + %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49 + %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50 + %.not.i = icmp eq i32 %147, 0, !dbg !50 + br i1 %.not.i, label %150, label %148, !dbg !50 + +148: ; preds = %7 + %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50 + br label %__nv_rsqrtf.exit, !dbg !50 + +150: ; preds = %7 + %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50 + br label %__nv_rsqrtf.exit, !dbg !50 + +__nv_rsqrtf.exit: ; preds = %148, %150 + %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50 + %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23 + %153 = bitcast i32 %152 to float, !dbg !23 + %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23 + %155 = bitcast i32 %154 to float, !dbg !23 + %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23 + %157 = bitcast i32 %156 to float, !dbg !23 + %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23 + %159 = bitcast i32 %158 to float, !dbg !23 + %160 = fmul float %107, %.0.i, !dbg !51 + %161 = fmul float %108, %.0.i, !dbg !51 + %162 = fmul float %109, %.0.i, !dbg !51 + %163 = fmul float %110, %.0.i, !dbg !51 + %164 = fmul float %160, %159, !dbg !52 + %165 = fmul float %161, %157, !dbg !52 + %166 = fmul float %162, %155, !dbg !52 + %167 = fmul float %163, %153, !dbg !52 + %168 = getelementptr float, ptr addrspace(1) %4, i64 %16, !dbg !53 + %169 = bitcast float %164 to i32, !dbg !54 + %170 = bitcast float %165 to i32, !dbg !54 + %171 = bitcast float %166 to i32, !dbg !54 + %172 = bitcast float %167 to i32, !dbg !54 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %169, i32 %170, i32 %171, i32 %172, ptr addrspace(1) %168, i1 true) #6, !dbg !54 + ret void, !dbg !55 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cpedrbcgvftrmo3x6vfpo6dhkxbweq3ucfj5jibyyvr3hf67gsvx.py", directory: "/tmp/torchinductor_root/pe") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 30, scope: !7) +!20 = !DILocation(line: 32, column: 46, scope: !7) +!21 = !DILocation(line: 32, column: 67, scope: !7) +!22 = !DILocation(line: 33, column: 31, scope: !7) +!23 = !DILocation(line: 33, column: 36, scope: !7) +!24 = !DILocation(line: 35, column: 18, scope: !7) +!25 = !DILocation(line: 37, column: 18, scope: !7) +!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30) +!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0) +!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) +!31 = !DILocation(line: 42, column: 59, scope: !27) +!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) +!33 = !DILocation(line: 42, column: 59, scope: !29) +!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 42, column: 45, scope: !35) +!38 = !DILocation(line: 45, column: 20, scope: !7) +!39 = !DILocation(line: 46, column: 19, scope: !7) +!40 = !DILocation(line: 47, column: 20, scope: !7) +!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42) +!42 = !DILocation(line: 50, column: 59, scope: !29) +!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44) +!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45) +!45 = !DILocation(line: 50, column: 59, scope: !27) +!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47) +!47 = !DILocation(line: 50, column: 45, scope: !35) +!48 = !DILocation(line: 53, column: 20, scope: !7) +!49 = !DILocation(line: 55, column: 20, scope: !7) +!50 = !DILocation(line: 56, column: 26, scope: !7) +!51 = !DILocation(line: 57, column: 20, scope: !7) +!52 = !DILocation(line: 58, column: 20, scope: !7) +!53 = !DILocation(line: 59, column: 25, scope: !7) +!54 = !DILocation(line: 59, column: 48, scope: !7) +!55 = !DILocation(line: 59, column: 4, scope: !7) diff --git a/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7f2001fc894b13ca58dbe11761174d70d18d7130 --- /dev/null +++ b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir @@ -0,0 +1,62 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant 9.99999974E-6 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %20 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %21 = arith.addf %20, %16 : tensor<256xf32, #blocked> + %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %40 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %24 = arith.addf %23, %cst_2 : f32 + %25 = arith.divf %24, %cst_1 : f32 + %26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked> + %27 = arith.subf %21, %26 : tensor<256xf32, #blocked> + %28 = arith.mulf %27, %27 : tensor<256xf32, #blocked> + %29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %40 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %31 = arith.addf %30, %cst_2 : f32 + %32 = arith.divf %31, %cst_1 : f32 + %33 = arith.addf %32, %cst_0 : f32 + %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked> + %36 = arith.mulf %27, %35 : tensor<256xf32, #blocked> + %37 = arith.mulf %36, %19 : tensor<256xf32, #blocked> + %38 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..8368d08e617d5afffe52b03226bb83f89c3b425c --- /dev/null +++ b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir @@ -0,0 +1,61 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %20 = arith.addf %8, %12 : tensor<256xf32> + %21 = arith.addf %20, %16 : tensor<256xf32> + %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32> + %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %40 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<256xf32>) -> f32 + %24 = arith.addf %23, %cst_0 : f32 + %25 = arith.divf %24, %cst_1 : f32 + %26 = tt.splat %25 : (f32) -> tensor<256xf32> + %27 = arith.subf %21, %26 : tensor<256xf32> + %28 = arith.mulf %27, %27 : tensor<256xf32> + %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %40 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<256xf32>) -> f32 + %31 = arith.addf %30, %cst_0 : f32 + %32 = arith.divf %31, %cst_1 : f32 + %33 = arith.addf %32, %cst_2 : f32 + %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %35 = tt.splat %34 : (f32) -> tensor<256xf32> + %36 = arith.mulf %27, %35 : tensor<256xf32> + %37 = arith.mulf %36, %19 : tensor<256xf32> + %38 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + tt.return + } +} diff --git a/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..9917c6d4a078a983b8356f1ab30ba05a8ca60554 --- /dev/null +++ b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.llir @@ -0,0 +1,213 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1d2d3d4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = shl i32 %6, 3, !dbg !8 + %8 = and i32 %7, 1016, !dbg !8 + %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %10 = shl i32 %9, 10, !dbg !10 + %11 = or i32 %10, %8, !dbg !11 + %.frozen = freeze i32 %11 + %12 = sdiv i32 %.frozen, 256, !dbg !12 + %13 = srem i32 %12, 3, !dbg !13 + %14 = mul i32 %12, 256 + %.decomposed = sub i32 %.frozen, %14 + %15 = sdiv i32 %11, 768, !dbg !14 + %16 = shl nsw i32 %15, 8, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = sext i32 %17 to i64, !dbg !17 + %19 = getelementptr i16, ptr addrspace(1) %0, i64 %18, !dbg !17 + %20 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %19, i1 true) #1, !dbg !18 + %21 = extractvalue { i32, i32, i32, i32 } %20, 0, !dbg !18 + %22 = extractvalue { i32, i32, i32, i32 } %20, 1, !dbg !18 + %23 = extractvalue { i32, i32, i32, i32 } %20, 2, !dbg !18 + %24 = extractvalue { i32, i32, i32, i32 } %20, 3, !dbg !18 + %25 = trunc i32 %21 to i16, !dbg !18 + %extelt.offset = lshr i32 %21, 16, !dbg !18 + %26 = trunc i32 %extelt.offset to i16, !dbg !18 + %27 = trunc i32 %22 to i16, !dbg !18 + %extelt.offset1 = lshr i32 %22, 16, !dbg !18 + %28 = trunc i32 %extelt.offset1 to i16, !dbg !18 + %29 = trunc i32 %23 to i16, !dbg !18 + %extelt.offset2 = lshr i32 %23, 16, !dbg !18 + %30 = trunc i32 %extelt.offset2 to i16, !dbg !18 + %31 = trunc i32 %24 to i16, !dbg !18 + %extelt.offset3 = lshr i32 %24, 16, !dbg !18 + %32 = trunc i32 %extelt.offset3 to i16, !dbg !18 + %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #1, !dbg !19 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #1, !dbg !19 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #1, !dbg !19 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #1, !dbg !19 + %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #1, !dbg !19 + %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #1, !dbg !19 + %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #1, !dbg !19 + %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #1, !dbg !19 + %41 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !20 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #1, !dbg !21 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21 + %47 = trunc i32 %43 to i16, !dbg !21 + %extelt.offset4 = lshr i32 %43, 16, !dbg !21 + %48 = trunc i32 %extelt.offset4 to i16, !dbg !21 + %49 = trunc i32 %44 to i16, !dbg !21 + %extelt.offset5 = lshr i32 %44, 16, !dbg !21 + %50 = trunc i32 %extelt.offset5 to i16, !dbg !21 + %51 = trunc i32 %45 to i16, !dbg !21 + %extelt.offset6 = lshr i32 %45, 16, !dbg !21 + %52 = trunc i32 %extelt.offset6 to i16, !dbg !21 + %53 = trunc i32 %46 to i16, !dbg !21 + %extelt.offset7 = lshr i32 %46, 16, !dbg !21 + %54 = trunc i32 %extelt.offset7 to i16, !dbg !21 + %55 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #1, !dbg !22 + %56 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #1, !dbg !22 + %57 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #1, !dbg !22 + %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #1, !dbg !22 + %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #1, !dbg !22 + %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #1, !dbg !22 + %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #1, !dbg !22 + %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #1, !dbg !22 + %63 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !23 + %64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %63, i1 true) #1, !dbg !24 + %65 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !24 + %66 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !24 + %67 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !24 + %68 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !24 + %69 = trunc i32 %65 to i16, !dbg !24 + %extelt.offset8 = lshr i32 %65, 16, !dbg !24 + %70 = trunc i32 %extelt.offset8 to i16, !dbg !24 + %71 = trunc i32 %66 to i16, !dbg !24 + %extelt.offset9 = lshr i32 %66, 16, !dbg !24 + %72 = trunc i32 %extelt.offset9 to i16, !dbg !24 + %73 = trunc i32 %67 to i16, !dbg !24 + %extelt.offset10 = lshr i32 %67, 16, !dbg !24 + %74 = trunc i32 %extelt.offset10 to i16, !dbg !24 + %75 = trunc i32 %68 to i16, !dbg !24 + %extelt.offset11 = lshr i32 %68, 16, !dbg !24 + %76 = trunc i32 %extelt.offset11 to i16, !dbg !24 + %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #1, !dbg !25 + %78 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #1, !dbg !25 + %79 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #1, !dbg !25 + %80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #1, !dbg !25 + %81 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #1, !dbg !25 + %82 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #1, !dbg !25 + %83 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %75) #1, !dbg !25 + %84 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %76) #1, !dbg !25 + %85 = icmp eq i32 %13, 2, !dbg !26 + %86 = select i1 %85, float %33, float 0.000000e+00, !dbg !27 + %87 = select i1 %85, float %34, float 0.000000e+00, !dbg !27 + %88 = select i1 %85, float %35, float 0.000000e+00, !dbg !27 + %89 = select i1 %85, float %36, float 0.000000e+00, !dbg !27 + %90 = select i1 %85, float %37, float 0.000000e+00, !dbg !27 + %91 = select i1 %85, float %38, float 0.000000e+00, !dbg !27 + %92 = select i1 %85, float %39, float 0.000000e+00, !dbg !27 + %93 = select i1 %85, float %40, float 0.000000e+00, !dbg !27 + %94 = icmp eq i32 %13, 1, !dbg !28 + %95 = select i1 %94, float %55, float 0.000000e+00, !dbg !29 + %96 = select i1 %94, float %56, float 0.000000e+00, !dbg !29 + %97 = select i1 %94, float %57, float 0.000000e+00, !dbg !29 + %98 = select i1 %94, float %58, float 0.000000e+00, !dbg !29 + %99 = select i1 %94, float %59, float 0.000000e+00, !dbg !29 + %100 = select i1 %94, float %60, float 0.000000e+00, !dbg !29 + %101 = select i1 %94, float %61, float 0.000000e+00, !dbg !29 + %102 = select i1 %94, float %62, float 0.000000e+00, !dbg !29 + %103 = fadd float %86, %95, !dbg !30 + %104 = fadd float %87, %96, !dbg !30 + %105 = fadd float %88, %97, !dbg !30 + %106 = fadd float %89, %98, !dbg !30 + %107 = fadd float %90, %99, !dbg !30 + %108 = fadd float %91, %100, !dbg !30 + %109 = fadd float %92, %101, !dbg !30 + %110 = fadd float %93, %102, !dbg !30 + %111 = icmp eq i32 %13, 0, !dbg !31 + %112 = select i1 %111, float %77, float 0.000000e+00, !dbg !32 + %113 = select i1 %111, float %78, float 0.000000e+00, !dbg !32 + %114 = select i1 %111, float %79, float 0.000000e+00, !dbg !32 + %115 = select i1 %111, float %80, float 0.000000e+00, !dbg !32 + %116 = select i1 %111, float %81, float 0.000000e+00, !dbg !32 + %117 = select i1 %111, float %82, float 0.000000e+00, !dbg !32 + %118 = select i1 %111, float %83, float 0.000000e+00, !dbg !32 + %119 = select i1 %111, float %84, float 0.000000e+00, !dbg !32 + %120 = fadd float %103, %112, !dbg !33 + %121 = fadd float %104, %113, !dbg !33 + %122 = fadd float %105, %114, !dbg !33 + %123 = fadd float %106, %115, !dbg !33 + %124 = fadd float %107, %116, !dbg !33 + %125 = fadd float %108, %117, !dbg !33 + %126 = fadd float %109, %118, !dbg !33 + %127 = fadd float %110, %119, !dbg !33 + %128 = sext i32 %11 to i64, !dbg !34 + %129 = getelementptr i16, ptr addrspace(1) %3, i64 %128, !dbg !34 + %130 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %120) #1, !dbg !35 + %131 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %121) #1, !dbg !35 + %132 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %122) #1, !dbg !35 + %133 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %123) #1, !dbg !35 + %134 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %124) #1, !dbg !35 + %135 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %125) #1, !dbg !35 + %136 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %126) #1, !dbg !35 + %137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %127) #1, !dbg !35 + %138 = insertelement <2 x i16> undef, i16 %130, i64 0, !dbg !35 + %139 = insertelement <2 x i16> %138, i16 %131, i64 1, !dbg !35 + %140 = bitcast <2 x i16> %139 to i32, !dbg !35 + %141 = insertelement <2 x i16> undef, i16 %132, i64 0, !dbg !35 + %142 = insertelement <2 x i16> %141, i16 %133, i64 1, !dbg !35 + %143 = bitcast <2 x i16> %142 to i32, !dbg !35 + %144 = insertelement <2 x i16> undef, i16 %134, i64 0, !dbg !35 + %145 = insertelement <2 x i16> %144, i16 %135, i64 1, !dbg !35 + %146 = bitcast <2 x i16> %145 to i32, !dbg !35 + %147 = insertelement <2 x i16> undef, i16 %136, i64 0, !dbg !35 + %148 = insertelement <2 x i16> %147, i16 %137, i64 1, !dbg !35 + %149 = bitcast <2 x i16> %148 to i32, !dbg !35 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %140, i32 %143, i32 %146, i32 %149, ptr addrspace(1) %129, i1 true) #1, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py", directory: "/tmp/torchinductor_root/63") +!3 = !{ptr @triton__0d1d2d3d4de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4de", linkageName: "triton__0d1d2d3d4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 23, column: 20, scope: !5) +!13 = !DILocation(line: 23, column: 27, scope: !5) +!14 = !DILocation(line: 25, column: 20, scope: !5) +!15 = !DILocation(line: 27, column: 40, scope: !5) +!16 = !DILocation(line: 27, column: 36, scope: !5) +!17 = !DILocation(line: 27, column: 30, scope: !5) +!18 = !DILocation(line: 27, column: 46, scope: !5) +!19 = !DILocation(line: 27, column: 85, scope: !5) +!20 = !DILocation(line: 28, column: 30, scope: !5) +!21 = !DILocation(line: 28, column: 46, scope: !5) +!22 = !DILocation(line: 28, column: 85, scope: !5) +!23 = !DILocation(line: 29, column: 31, scope: !5) +!24 = !DILocation(line: 29, column: 47, scope: !5) +!25 = !DILocation(line: 29, column: 86, scope: !5) +!26 = !DILocation(line: 32, column: 19, scope: !5) +!27 = !DILocation(line: 34, column: 32, scope: !5) +!28 = !DILocation(line: 36, column: 19, scope: !5) +!29 = !DILocation(line: 37, column: 32, scope: !5) +!30 = !DILocation(line: 38, column: 19, scope: !5) +!31 = !DILocation(line: 40, column: 20, scope: !5) +!32 = !DILocation(line: 41, column: 35, scope: !5) +!33 = !DILocation(line: 42, column: 20, scope: !5) +!34 = !DILocation(line: 43, column: 25, scope: !5) +!35 = !DILocation(line: 43, column: 37, scope: !5) +!36 = !DILocation(line: 43, column: 4, scope: !5) diff --git a/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..692e6b79c189bcbf821a65cb631a78fd5d621abb --- /dev/null +++ b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx @@ -0,0 +1,495 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4de + +.visible .entry triton__0d1d2d3d4de( + .param .u64 triton__0d1d2d3d4de_param_0, + .param .u64 triton__0d1d2d3d4de_param_1, + .param .u64 triton__0d1d2d3d4de_param_2, + .param .u64 triton__0d1d2d3d4de_param_3, + .param .u32 triton__0d1d2d3d4de_param_4 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<8>; + .reg .b16 %rs<33>; + .reg .b32 %r<77>; + .reg .f32 %f<65>; + .reg .b64 %rd<11>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0]; + ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r50, %tid.x; + shl.b32 %r51, %r50, 3; + ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2]; + and.b32 %r52, %r51, 1016; + ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3]; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r53, %r1, 10; + .loc 1 21 23 + or.b32 %r54, %r53, %r52; + .loc 1 23 20 + shr.s32 %r56, %r54, 31; + shr.u32 %r57, %r56, 24; + add.s32 %r58, %r54, %r57; + shr.s32 %r59, %r58, 8; + .loc 1 23 27 + mul.hi.s32 %r60, %r59, 1431655766; + shr.u32 %r61, %r60, 31; + add.s32 %r62, %r60, %r61; + mul.lo.s32 %r63, %r62, 3; + sub.s32 %r64, %r59, %r63; + and.b32 %r65, %r58, -256; + sub.s32 %r66, %r54, %r65; + .loc 1 25 20 + mul.hi.s32 %r67, %r54, 715827883; + shr.u32 %r68, %r67, 31; + shr.u32 %r69, %r67, 7; + add.s32 %r70, %r69, %r68; + .loc 1 27 40 + shl.b32 %r71, %r70, 8; + .loc 1 27 36 + add.s32 %r72, %r71, %r66; + .loc 1 27 30 + mul.wide.s32 %rd9, %r72, 2; + add.s64 %rd1, %rd5, %rd9; + mov.pred %p1, -1; + .loc 1 27 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + cvt.u16.u32 %rs5, %r4; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; } + cvt.u16.u32 %rs7, %r5; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } + .loc 1 27 85 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + cvt.f32.bf16 %r10, %rs5; + mov.b32 %f5, %r10; + cvt.f32.bf16 %r11, %rs6; + mov.b32 %f6, %r11; + cvt.f32.bf16 %r12, %rs7; + mov.b32 %f7, %r12; + cvt.f32.bf16 %r13, %rs8; + mov.b32 %f8, %r13; + .loc 1 28 30 + add.s64 %rd2, %rd6, %rd9; + .loc 1 28 46 + mov.u32 %r14, 0x0; + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ]; + cvt.u16.u32 %rs9, %r14; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; } + cvt.u16.u32 %rs11, %r15; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; } + cvt.u16.u32 %rs13, %r16; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; } + cvt.u16.u32 %rs15, %r17; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; } + .loc 1 28 85 + cvt.f32.bf16 %r18, %rs9; + mov.b32 %f9, %r18; + cvt.f32.bf16 %r19, %rs10; + mov.b32 %f10, %r19; + cvt.f32.bf16 %r20, %rs11; + mov.b32 %f11, %r20; + cvt.f32.bf16 %r21, %rs12; + mov.b32 %f12, %r21; + cvt.f32.bf16 %r22, %rs13; + mov.b32 %f13, %r22; + cvt.f32.bf16 %r23, %rs14; + mov.b32 %f14, %r23; + cvt.f32.bf16 %r24, %rs15; + mov.b32 %f15, %r24; + cvt.f32.bf16 %r25, %rs16; + mov.b32 %f16, %r25; + .loc 1 29 31 + add.s64 %rd3, %rd7, %rd9; + .loc 1 29 47 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ]; + cvt.u16.u32 %rs17, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; } + cvt.u16.u32 %rs19, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; } + cvt.u16.u32 %rs21, %r28; + { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; } + cvt.u16.u32 %rs23, %r29; + { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; } + .loc 1 29 86 + cvt.f32.bf16 %r30, %rs17; + mov.b32 %f17, %r30; + cvt.f32.bf16 %r31, %rs18; + mov.b32 %f18, %r31; + cvt.f32.bf16 %r32, %rs19; + mov.b32 %f19, %r32; + cvt.f32.bf16 %r33, %rs20; + mov.b32 %f20, %r33; + cvt.f32.bf16 %r34, %rs21; + mov.b32 %f21, %r34; + cvt.f32.bf16 %r35, %rs22; + mov.b32 %f22, %r35; + cvt.f32.bf16 %r36, %rs23; + mov.b32 %f23, %r36; + cvt.f32.bf16 %r37, %rs24; + mov.b32 %f24, %r37; + .loc 1 32 19 + setp.eq.s32 %p5, %r64, 2; + .loc 1 34 32 + selp.f32 %f25, %f1, 0f00000000, %p5; + selp.f32 %f26, %f2, 0f00000000, %p5; + selp.f32 %f27, %f3, 0f00000000, %p5; + selp.f32 %f28, %f4, 0f00000000, %p5; + selp.f32 %f29, %f5, 0f00000000, %p5; + selp.f32 %f30, %f6, 0f00000000, %p5; + selp.f32 %f31, %f7, 0f00000000, %p5; + selp.f32 %f32, %f8, 0f00000000, %p5; + .loc 1 36 19 + setp.eq.s32 %p6, %r64, 1; + .loc 1 37 32 + selp.f32 %f33, %f9, 0f00000000, %p6; + selp.f32 %f34, %f10, 0f00000000, %p6; + selp.f32 %f35, %f11, 0f00000000, %p6; + selp.f32 %f36, %f12, 0f00000000, %p6; + selp.f32 %f37, %f13, 0f00000000, %p6; + selp.f32 %f38, %f14, 0f00000000, %p6; + selp.f32 %f39, %f15, 0f00000000, %p6; + selp.f32 %f40, %f16, 0f00000000, %p6; + .loc 1 38 19 + add.f32 %f41, %f25, %f33; + add.f32 %f42, %f26, %f34; + add.f32 %f43, %f27, %f35; + add.f32 %f44, %f28, %f36; + add.f32 %f45, %f29, %f37; + add.f32 %f46, %f30, %f38; + add.f32 %f47, %f31, %f39; + add.f32 %f48, %f32, %f40; + .loc 1 40 20 + setp.eq.s32 %p7, %r64, 0; + .loc 1 41 35 + selp.f32 %f49, %f17, 0f00000000, %p7; + selp.f32 %f50, %f18, 0f00000000, %p7; + selp.f32 %f51, %f19, 0f00000000, %p7; + selp.f32 %f52, %f20, 0f00000000, %p7; + selp.f32 %f53, %f21, 0f00000000, %p7; + selp.f32 %f54, %f22, 0f00000000, %p7; + selp.f32 %f55, %f23, 0f00000000, %p7; + selp.f32 %f56, %f24, 0f00000000, %p7; + .loc 1 42 20 + add.f32 %f57, %f41, %f49; + add.f32 %f58, %f42, %f50; + add.f32 %f59, %f43, %f51; + add.f32 %f60, %f44, %f52; + add.f32 %f61, %f45, %f53; + add.f32 %f62, %f46, %f54; + add.f32 %f63, %f47, %f55; + add.f32 %f64, %f48, %f56; + .loc 1 43 25 + mul.wide.s32 %rd10, %r54, 2; + add.s64 %rd4, %rd8, %rd10; + .loc 1 43 37 + mov.b32 %r38, %f57; + cvt.rn.bf16.f32 %rs25, %r38; + mov.b32 %r39, %f58; + cvt.rn.bf16.f32 %rs26, %r39; + mov.b32 %r40, %f59; + cvt.rn.bf16.f32 %rs27, %r40; + mov.b32 %r41, %f60; + cvt.rn.bf16.f32 %rs28, %r41; + mov.b32 %r42, %f61; + cvt.rn.bf16.f32 %rs29, %r42; + mov.b32 %r43, %f62; + cvt.rn.bf16.f32 %rs30, %r43; + mov.b32 %r44, %f63; + cvt.rn.bf16.f32 %rs31, %r44; + mov.b32 %r45, %f64; + cvt.rn.bf16.f32 %rs32, %r45; + mov.b32 %r73, {%rs25, %rs26}; + mov.b32 %r74, {%rs27, %rs28}; + mov.b32 %r75, {%rs29, %rs30}; + mov.b32 %r76, {%rs31, %rs32}; + @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 }; + .loc 1 43 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 184 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 51 +.b8 114 +.b8 55 +.b8 105 +.b8 117 +.b8 114 +.b8 119 +.b8 107 +.b8 53 +.b8 121 +.b8 100 +.b8 108 +.b8 115 +.b8 119 +.b8 104 +.b8 55 +.b8 114 +.b8 118 +.b8 104 +.b8 99 +.b8 109 +.b8 108 +.b8 120 +.b8 50 +.b8 99 +.b8 102 +.b8 114 +.b8 101 +.b8 116 +.b8 108 +.b8 114 +.b8 101 +.b8 119 +.b8 103 +.b8 119 +.b8 54 +.b8 116 +.b8 108 +.b8 106 +.b8 108 +.b8 117 +.b8 114 +.b8 115 +.b8 115 +.b8 104 +.b8 103 +.b8 116 +.b8 102 +.b8 112 +.b8 112 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 51 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 188 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 188 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..adf9d7e80cd21614d3a9a373b50cd05d52262789 --- /dev/null +++ b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx @@ -0,0 +1,486 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<10>; + .reg .b16 %rs<7>; + .reg .b32 %r<25>; + .reg .f32 %f<127>; + .reg .b64 %rd<8>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd4, [triton__0d1d2de_param_0]; + ld.param.u64 %rd5, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r8, %tid.x; + shl.b32 %r9, %r8, 1; + and.b32 %r10, %r9, 510; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r11, %r1, 9; + .loc 1 21 23 + or.b32 %r12, %r11, %r10; + .loc 1 24 34 + mul.wide.s32 %rd6, %r12, 2; + add.s64 %rd7, %rd4, %rd6; + mov.pred %p1, -1; + .loc 1 24 39 + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ]; + .loc 1 25 30 + add.s64 %rd3, %rd5, %rd6; + .loc 1 25 35 + mov.u32 %r5, 0x0; + @%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ]; + cvt.u16.u32 %rs3, %r5; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; } + .loc 1 25 44 + cvt.f32.bf16 %r6, %rs3; + mov.b32 %f3, %r6; + cvt.f32.bf16 %r7, %rs4; + mov.b32 %f4, %r7; + .loc 1 29 18 + mul.f32 %f5, %f3, 0f3F3504F3; + .loc 1 30 23 + abs.ftz.f32 %f7, %f5; + setp.ge.f32 %p3, %f7, 0f3F8060FE; + mov.f32 %f115, 0f3789CA3C; + mov.f32 %f114, 0fB9F560B9; + mov.f32 %f113, 0f3BAC840B; + mov.f32 %f112, 0fBD0C8162; + mov.f32 %f111, 0f3E1CF906; + mov.f32 %f110, 0f3F6A937E; + mov.f32 %f109, 0f3F20D842; + mov.f32 %f116, %f7; + @%p3 bra $L__BB0_2; + .loc 1 0 23 + mov.f32 %f115, 0f38B1E96A; + mov.f32 %f114, 0fBA574D20; + mov.f32 %f113, 0f3BAAD5EA; + mov.f32 %f112, 0fBCDC1BE7; + mov.f32 %f111, 0f3DE718AF; + mov.f32 %f110, 0fBEC093AC; + mov.f32 %f109, 0f3E0375D3; + .loc 1 30 23 + mul.f32 %f116, %f5, %f5; +$L__BB0_2: + .loc 1 0 0 + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + mul.f32 %f6, %f4, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p4, %f7, 0f3F8060FE; + fma.rn.ftz.f32 %f47, %f115, %f116, %f114; + fma.rn.ftz.f32 %f48, %f47, %f116, %f113; + fma.rn.ftz.f32 %f49, %f48, %f116, %f112; + fma.rn.ftz.f32 %f50, %f49, %f116, %f111; + fma.rn.ftz.f32 %f51, %f50, %f116, %f110; + fma.rn.ftz.f32 %f52, %f51, %f116, %f109; + neg.f32 %f53, %f116; + selp.f32 %f54, %f53, %f5, %p3; + fma.rn.ftz.f32 %f117, %f52, %f54, %f54; + mov.f32 %f108, 0f3F800000; + @%p4 bra $L__BB0_4; + ex2.approx.ftz.f32 %f55, %f117; + sub.f32 %f57, %f108, %f55; + mov.b32 %r13, %f57; + mov.b32 %r14, %f5; + and.b32 %r15, %r14, -2147483648; + or.b32 %r16, %r15, %r13; + mov.b32 %f117, %r16; +$L__BB0_4: + .loc 1 0 0 + cvt.f32.bf16 %r3, %rs1; + cvt.f32.bf16 %r4, %rs2; + .loc 1 30 23 + abs.ftz.f32 %f20, %f6; + setp.ge.f32 %p6, %f20, 0f3F8060FE; + mov.f32 %f124, 0f3789CA3C; + mov.f32 %f123, 0fB9F560B9; + mov.f32 %f122, 0f3BAC840B; + mov.f32 %f121, 0fBD0C8162; + mov.f32 %f120, 0f3E1CF906; + mov.f32 %f119, 0f3F6A937E; + mov.f32 %f118, 0f3F20D842; + mov.f32 %f125, %f20; + @%p6 bra $L__BB0_6; + mul.f32 %f125, %f6, %f6; + mov.f32 %f124, 0f38B1E96A; + mov.f32 %f123, 0fBA574D20; + mov.f32 %f122, 0f3BAAD5EA; + mov.f32 %f121, 0fBCDC1BE7; + mov.f32 %f120, 0f3DE718AF; + mov.f32 %f119, 0fBEC093AC; + mov.f32 %f118, 0f3E0375D3; +$L__BB0_6: + .loc 1 0 0 + mov.b32 %f1, %r3; + mov.b32 %f2, %r4; + .loc 1 30 23 + setp.ltu.f32 %p7, %f20, 0f3F8060FE; + fma.rn.ftz.f32 %f72, %f124, %f125, %f123; + fma.rn.ftz.f32 %f73, %f72, %f125, %f122; + fma.rn.ftz.f32 %f74, %f73, %f125, %f121; + fma.rn.ftz.f32 %f75, %f74, %f125, %f120; + fma.rn.ftz.f32 %f76, %f75, %f125, %f119; + fma.rn.ftz.f32 %f77, %f76, %f125, %f118; + neg.f32 %f78, %f125; + selp.f32 %f79, %f78, %f6, %p6; + fma.rn.ftz.f32 %f126, %f77, %f79, %f79; + @%p7 bra $L__BB0_8; + ex2.approx.ftz.f32 %f80, %f126; + sub.f32 %f82, %f108, %f80; + mov.b32 %r17, %f82; + mov.b32 %r18, %f6; + and.b32 %r19, %r18, -2147483648; + or.b32 %r20, %r19, %r17; + mov.b32 %f126, %r20; +$L__BB0_8: + .loc 1 32 18 + add.f32 %f87, %f117, 0f3F800000; + add.f32 %f88, %f126, 0f3F800000; + .loc 1 35 19 + mul.f32 %f89, %f3, %f3; + mul.f32 %f90, %f4, %f4; + .loc 1 37 20 + mul.f32 %f91, %f89, 0fBF000000; + mul.f32 %f92, %f90, 0fBF000000; + .loc 1 38 19 + mul.f32 %f84, %f91, 0f3FB8AA3B; + ex2.approx.f32 %f83, %f84; + mul.f32 %f86, %f92, 0f3FB8AA3B; + ex2.approx.f32 %f85, %f86; + .loc 1 40 20 + mul.f32 %f93, %f83, 0f3ECC422A; + mul.f32 %f94, %f85, 0f3ECC422A; + .loc 1 41 19 + mul.f32 %f95, %f3, %f93; + mul.f32 %f96, %f4, %f94; + .loc 1 42 20 + fma.rn.f32 %f97, %f87, 0f3F000000, %f95; + fma.rn.f32 %f98, %f88, 0f3F000000, %f96; + .loc 1 43 19 + mul.f32 %f99, %f1, %f97; + mul.f32 %f100, %f2, %f98; + .loc 1 45 40 + mov.b32 %r21, %f99; + cvt.rn.bf16.f32 %rs5, %r21; + mov.b32 %r22, %f100; + cvt.rn.bf16.f32 %rs6, %r22; + mov.b32 %r24, {%rs5, %rs6}; + @%p1 st.global.b32 [ %rd7 + 0 ], { %r24 }; + .loc 1 45 4 + ret; +$L__tmp1: +$L__func_end0: + +} + // .globl __nv_erff +.visible .func (.param .b32 func_retval0) __nv_erff( + .param .b32 __nv_erff_param_0 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<5>; + .reg .f32 %f<49>; +$L__func_begin1: + + ld.param.f32 %f14, [__nv_erff_param_0]; + abs.ftz.f32 %f1, %f14; + setp.ge.f32 %p1, %f1, 0f3F8060FE; + mov.f32 %f46, 0f3789CA3C; + mov.f32 %f45, 0fB9F560B9; + mov.f32 %f44, 0f3BAC840B; + mov.f32 %f43, 0fBD0C8162; + mov.f32 %f42, 0f3E1CF906; + mov.f32 %f41, 0f3F6A937E; + mov.f32 %f40, 0f3F20D842; + mov.f32 %f47, %f1; + @%p1 bra $L__BB1_2; + mul.f32 %f47, %f14, %f14; + mov.f32 %f46, 0f38B1E96A; + mov.f32 %f45, 0fBA574D20; + mov.f32 %f44, 0f3BAAD5EA; + mov.f32 %f43, 0fBCDC1BE7; + mov.f32 %f42, 0f3DE718AF; + mov.f32 %f41, 0fBEC093AC; + mov.f32 %f40, 0f3E0375D3; +$L__BB1_2: + setp.ltu.f32 %p2, %f1, 0f3F8060FE; + fma.rn.ftz.f32 %f29, %f46, %f47, %f45; + fma.rn.ftz.f32 %f30, %f29, %f47, %f44; + fma.rn.ftz.f32 %f31, %f30, %f47, %f43; + fma.rn.ftz.f32 %f32, %f31, %f47, %f42; + fma.rn.ftz.f32 %f33, %f32, %f47, %f41; + fma.rn.ftz.f32 %f34, %f33, %f47, %f40; + neg.f32 %f35, %f47; + selp.f32 %f36, %f35, %f14, %p1; + fma.rn.ftz.f32 %f48, %f34, %f36, %f36; + @%p2 bra $L__BB1_4; + ex2.approx.ftz.f32 %f37, %f48; + mov.f32 %f38, 0f3F800000; + sub.f32 %f39, %f38, %f37; + mov.b32 %r1, %f39; + mov.b32 %r2, %f14; + and.b32 %r3, %r2, -2147483648; + or.b32 %r4, %r3, %r1; + mov.b32 %f48, %r4; +$L__BB1_4: + st.param.f32 [func_retval0+0], %f48; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 53 +.b8 106 +.b8 120 +.b8 97 +.b8 103 +.b8 117 +.b8 120 +.b8 104 +.b8 111 +.b8 51 +.b8 110 +.b8 104 +.b8 114 +.b8 108 +.b8 116 +.b8 53 +.b8 118 +.b8 99 +.b8 105 +.b8 110 +.b8 110 +.b8 122 +.b8 53 +.b8 102 +.b8 101 +.b8 118 +.b8 111 +.b8 100 +.b8 117 +.b8 109 +.b8 108 +.b8 112 +.b8 119 +.b8 110 +.b8 52 +.b8 119 +.b8 121 +.b8 98 +.b8 50 +.b8 118 +.b8 120 +.b8 51 +.b8 120 +.b8 114 +.b8 118 +.b8 101 +.b8 105 +.b8 99 +.b8 101 +.b8 114 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 53 +.b8 106 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..11cb1f8079ff9ee53ca610feb409d72bab077850 --- /dev/null +++ b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir @@ -0,0 +1,38 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.398942292> : tensor<512xf32, #blocked> + %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32, #blocked> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> + %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %12 = arith.extf %11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %13 = arith.mulf %12, %cst_3 : tensor<512xf32, #blocked> + %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked> + %15 = arith.addf %14, %cst_2 : tensor<512xf32, #blocked> + %16 = arith.mulf %15, %cst_1 : tensor<512xf32, #blocked> + %17 = arith.mulf %12, %12 : tensor<512xf32, #blocked> + %18 = arith.mulf %17, %cst_0 : tensor<512xf32, #blocked> + %19 = math.exp %18 : tensor<512xf32, #blocked> + %20 = arith.mulf %19, %cst : tensor<512xf32, #blocked> + %21 = arith.mulf %12, %20 : tensor<512xf32, #blocked> + %22 = arith.addf %16, %21 : tensor<512xf32, #blocked> + %23 = arith.mulf %8, %22 : tensor<512xf32, #blocked> + %24 = arith.truncf %23 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> + tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..746b65f12455b8baa9865625d37b4f09f8d4737f --- /dev/null +++ b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir @@ -0,0 +1,53 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<16x1xi64> + %cst_0 = arith.constant dense<0> : tensor<16x1xi64> + %cst_1 = arith.constant dense<512> : tensor<16x1xi64> + %cst_2 = arith.constant dense : tensor<16x1xi1> + %cst_3 = arith.constant dense<256> : tensor<16x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x128xi32> + %cst_5 = arith.constant dense<120> : tensor<1x128xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<16x128xf32> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<16x1xi32> + %5 = arith.addi %4, %3 : tensor<16x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32> + %9 = arith.muli %7, %cst_4 : tensor<1x128xi32> + %10 = tt.broadcast %5 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %12 = arith.addi %10, %11 : tensor<16x128xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %14 = tt.addptr %13, %12 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32> + %17 = arith.addf %16, %cst_6 : tensor<16x128xf32> + %18 = arith.select %15, %17, %cst_6 : tensor<16x128xi1>, tensor<16x128xf32> + %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %35 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %35 : f32 + }) : (tensor<16x128xf32>) -> tensor<16xf32> + %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %21 = arith.divsi %5, %cst_3 : tensor<16x1xi32> + %22 = arith.remsi %5, %cst_3 : tensor<16x1xi32> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %24 = tt.addptr %23, %21 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> + %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> + %26 = arith.addi %25, %cst_1 : tensor<16x1xi64> + %27 = arith.cmpi slt, %25, %cst_0 : tensor<16x1xi64> + %28 = arith.select %27, %26, %25 : tensor<16x1xi1>, tensor<16x1xi64> + %29 = arith.muli %28, %cst : tensor<16x1xi64> + %30 = arith.extsi %22 : tensor<16x1xi32> to tensor<16x1xi64> + %31 = arith.addi %30, %29 : tensor<16x1xi64> + %32 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %33 = tt.addptr %32, %31 : tensor<16x1x!tt.ptr>, tensor<16x1xi64> + %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr>, tensor<16x1xf32>, tensor<16x1xi1>) -> tensor<16x1xf32> + tt.return + } +} diff --git a/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..62c1c5d3f3f015c0b9b8a5a5d1e202f0fa50ba27 Binary files /dev/null and b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.cubin differ diff --git a/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ba059a54f10f929ebe91557a7994d0ecf5eced50 --- /dev/null +++ b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ptx @@ -0,0 +1,709 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<33>; + .reg .b16 %rs<9>; + .reg .b32 %r<106>; + .reg .f32 %f<73>; + .reg .b64 %rd<21>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_0]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r72, %tid.x; + and.b32 %r73, %r72, 31; + ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4]; + shl.b32 %r74, %r72, 2; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5]; + and.b32 %r75, %r74, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r76, %r1, 8; + .loc 1 30 36 + or.b32 %r77, %r76, %r75; + .loc 1 30 30 + mul.wide.s32 %rd17, %r77, 2; + add.s64 %rd1, %rd12, %rd17; + mov.b32 %r4, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r4; + @!%p1 mov.u32 %r3, %r4; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 30 67 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + .loc 1 31 30 + mul.wide.u32 %rd18, %r75, 4; + add.s64 %rd2, %rd13, %rd18; + .loc 1 31 35 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r4; + @!%p1 mov.u32 %r11, %r4; + @!%p1 mov.u32 %r12, %r4; + @!%p1 mov.u32 %r13, %r4; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 32 30 + mul.wide.s32 %rd19, %r77, 4; + add.s64 %rd3, %rd14, %rd19; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r4; + @!%p1 mov.u32 %r19, %r4; + @!%p1 mov.u32 %r20, %r4; + @!%p1 mov.u32 %r21, %r4; + mov.b32 %f9, %r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + .loc 1 33 35 + add.s64 %rd4, %rd11, %rd19; + .loc 1 33 51 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r4; + @!%p1 mov.u32 %r27, %r4; + @!%p1 mov.u32 %r28, %r4; + @!%p1 mov.u32 %r29, %r4; + mov.b32 %f13, %r26; + mov.b32 %f14, %r27; + mov.b32 %f15, %r28; + mov.b32 %f16, %r29; + .loc 1 34 31 + mul.wide.s32 %rd20, %r1, 4; + add.s64 %rd5, %rd15, %rd20; + .loc 1 34 36 + mov.u32 %r51, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r51 }, [ %rd5 + 0 ]; + mov.u32 %r35, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ]; + mov.u32 %r36, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ]; + mov.u32 %r37, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ]; + .loc 1 36 18 + mul.f32 %f17, %f1, %f5; + mul.f32 %f18, %f2, %f6; + mul.f32 %f19, %f3, %f7; + mul.f32 %f20, %f4, %f8; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f21, %f1, %f5, %f18; + fma.rn.f32 %f22, %f3, %f7, %f21; + fma.rn.f32 %f23, %f4, %f8, %f22; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r78, %f23; + shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1; + mov.b32 %f24, %r79; +$L__tmp3: + .loc 2 233 15 + add.f32 %f25, %f23, %f24; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r80, %f25; + shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1; + mov.b32 %f26, %r81; +$L__tmp5: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r82, %f27; + shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1; + mov.b32 %f28, %r83; +$L__tmp7: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r84, %f29; + shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1; + mov.b32 %f30, %r85; +$L__tmp9: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r86, %f31; + shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1; + mov.b32 %f32, %r87; +$L__tmp11: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p23, %r73, 0; + shr.u32 %r88, %r72, 3; + and.b32 %r89, %r88, 4; + mov.u32 %r90, global_smem; + add.s32 %r38, %r90, %r89; + mov.b32 %r39, %f33; + @%p23 st.shared.b32 [ %r38 + 0 ], %r39; + bar.sync 0; + setp.lt.s32 %p24, %r72, 2; + add.s32 %r41, %r90, %r74; + @%p24 ld.shared.b32 %r40, [ %r41 + 0 ]; + mov.b32 %f34, %r40; + shfl.sync.bfly.b32 %r91, %r40, 1, 31, -1; + mov.b32 %f35, %r91; +$L__tmp13: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp14: + .loc 2 243 36 + and.b32 %r92, %r72, 1; + setp.eq.b32 %p31, %r92, 1; + not.pred %p32, %p31; + and.pred %p25, %p24, %p32; + mov.b32 %r43, %f36; + @%p25 st.shared.b32 [ %r41 + 0 ], %r43; + bar.sync 0; + ld.shared.f32 %f37, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f38, %f37, 0f00000000; +$L__tmp16: + .loc 1 40 18 + mul.f32 %f39, %f18, %f10; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f40, %f17, %f9, %f39; + fma.rn.f32 %f41, %f19, %f11, %f40; + fma.rn.f32 %f42, %f20, %f12, %f41; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r93, %f42; + shfl.sync.bfly.b32 %r94, %r93, 16, 31, -1; + mov.b32 %f43, %r94; +$L__tmp20: + .loc 2 233 15 + add.f32 %f44, %f42, %f43; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r95, %f44; + shfl.sync.bfly.b32 %r96, %r95, 8, 31, -1; + mov.b32 %f45, %r96; +$L__tmp22: + .loc 2 233 15 + add.f32 %f46, %f44, %f45; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r97, %f46; + shfl.sync.bfly.b32 %r98, %r97, 4, 31, -1; + mov.b32 %f47, %r98; +$L__tmp24: + .loc 2 233 15 + add.f32 %f48, %f46, %f47; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r99, %f48; + shfl.sync.bfly.b32 %r100, %r99, 2, 31, -1; + mov.b32 %f49, %r100; +$L__tmp26: + .loc 2 233 15 + add.f32 %f50, %f48, %f49; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r101, %f50; + shfl.sync.bfly.b32 %r102, %r101, 1, 31, -1; + mov.b32 %f51, %r102; +$L__tmp28: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r45, %f52; + @%p23 st.shared.b32 [ %r38 + 0 ], %r45; + bar.sync 0; + @%p24 ld.shared.b32 %r46, [ %r41 + 0 ]; + mov.b32 %f53, %r46; + shfl.sync.bfly.b32 %r103, %r46, 1, 31, -1; + mov.b32 %f54, %r103; +$L__tmp30: + .loc 2 233 15 + add.f32 %f55, %f53, %f54; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r49, %f55; + @%p25 st.shared.b32 [ %r41 + 0 ], %r49; + bar.sync 0; + ld.shared.f32 %f56, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f57, %f56, 0f00000000; + mov.b32 %r52, 1132462080; +$L__tmp33: + .loc 1 45 20 + div.full.f32 %r50, %r51, %r52; + mov.b32 %f58, %r50; + .loc 1 47 20 + neg.f32 %f59, %f38; + fma.rn.f32 %f60, %f17, 0f43800000, %f59; + fma.rn.f32 %f61, %f18, 0f43800000, %f59; + fma.rn.f32 %f62, %f19, 0f43800000, %f59; + fma.rn.f32 %f63, %f20, 0f43800000, %f59; + .loc 1 49 20 + neg.f32 %f64, %f57; + fma.rn.f32 %f65, %f64, %f9, %f60; + fma.rn.f32 %f66, %f64, %f10, %f61; + fma.rn.f32 %f67, %f64, %f11, %f62; + fma.rn.f32 %f68, %f64, %f12, %f63; + .loc 1 51 20 + fma.rn.f32 %f69, %f58, %f65, %f13; + fma.rn.f32 %f70, %f58, %f66, %f14; + fma.rn.f32 %f71, %f58, %f67, %f15; + fma.rn.f32 %f72, %f58, %f68, %f16; + .loc 1 53 51 + mov.b32 %r62, %f69; + mov.b32 %r63, %f70; + mov.b32 %r64, %f71; + mov.b32 %r65, %f72; + @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r62, %r63, %r64, %r65 }; + .loc 1 54 25 + add.s64 %rd10, %rd16, %rd17; + .loc 1 54 48 + cvt.rn.bf16.f32 %rs5, %r62; + cvt.rn.bf16.f32 %rs6, %r63; + cvt.rn.bf16.f32 %rs7, %r64; + cvt.rn.bf16.f32 %rs8, %r65; + mov.b32 %r104, {%rs5, %rs6}; + mov.b32 %r105, {%rs7, %rs8}; + @%p1 st.global.v2.b32 [ %rd10 + 0 ], { %r104, %r105 }; + .loc 1 54 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/rn/crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 399 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 114 +.b8 110 +.b8 121 +.b8 110 +.b8 98 +.b8 109 +.b8 115 +.b8 100 +.b8 50 +.b8 121 +.b8 101 +.b8 108 +.b8 108 +.b8 50 +.b8 108 +.b8 112 +.b8 106 +.b8 121 +.b8 109 +.b8 98 +.b8 52 +.b8 54 +.b8 114 +.b8 116 +.b8 116 +.b8 102 +.b8 97 +.b8 101 +.b8 97 +.b8 50 +.b8 120 +.b8 106 +.b8 119 +.b8 115 +.b8 98 +.b8 120 +.b8 114 +.b8 55 +.b8 53 +.b8 106 +.b8 53 +.b8 52 +.b8 103 +.b8 99 +.b8 116 +.b8 102 +.b8 103 +.b8 105 +.b8 52 +.b8 53 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 114 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 39 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 39 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 39 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 43 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 43 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 43 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 403 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 403 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..30b3e01578542f25f42cefbfd2799f6e4511693e --- /dev/null +++ b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir @@ -0,0 +1,66 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %18 = tt.load %17, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %19 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %22 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %23 = arith.select %2, %22, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %24 = "tt.reduce"(%23) <{axis = 0 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %43 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %43 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %25 = arith.addf %24, %cst_1 : f32 + %26 = arith.mulf %22, %15 : tensor<256xf32, #blocked> + %27 = arith.select %2, %26, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %43 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %43 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %29 = arith.addf %28, %cst_1 : f32 + %30 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked> + %31 = arith.mulf %22, %cst_3 : tensor<256xf32, #blocked> + %32 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked> + %33 = arith.subf %31, %32 : tensor<256xf32, #blocked> + %34 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked> + %35 = arith.mulf %15, %34 : tensor<256xf32, #blocked> + %36 = arith.subf %33, %35 : tensor<256xf32, #blocked> + %37 = tt.broadcast %30 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %38 = arith.mulf %37, %36 : tensor<256xf32, #blocked> + %39 = arith.addf %18, %38 : tensor<256xf32, #blocked> + tt.store %17, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %40 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %41 = tt.addptr %40, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %42 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %41, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7b51fb4246e6c2d3e13cf8e462dc2485b2c851a1 Binary files /dev/null and b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin differ diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..516330e5c027335adcb5b898329b1a17a0146ec2 --- /dev/null +++ b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir @@ -0,0 +1,15 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..065b475f12b2e3090afe97059ea1ff3dd3b770d0 Binary files /dev/null and b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin differ diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7d915f749f517d5b7fd431b32400f530961585b2 --- /dev/null +++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir @@ -0,0 +1,38 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.398942292> : tensor<1024xf32, #blocked> + %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32, #blocked> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> + %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %12 = arith.extf %11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> + %13 = arith.mulf %12, %cst_3 : tensor<1024xf32, #blocked> + %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked> + %15 = arith.addf %14, %cst_2 : tensor<1024xf32, #blocked> + %16 = arith.mulf %15, %cst_1 : tensor<1024xf32, #blocked> + %17 = arith.mulf %12, %12 : tensor<1024xf32, #blocked> + %18 = arith.mulf %17, %cst_0 : tensor<1024xf32, #blocked> + %19 = math.exp %18 : tensor<1024xf32, #blocked> + %20 = arith.mulf %19, %cst : tensor<1024xf32, #blocked> + %21 = arith.mulf %12, %20 : tensor<1024xf32, #blocked> + %22 = arith.addf %16, %21 : tensor<1024xf32, #blocked> + %23 = arith.mulf %8, %22 : tensor<1024xf32, #blocked> + %24 = arith.truncf %23 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> + tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..297b7574c8f32a6d7465900b7eb16ea68ebde7dd --- /dev/null +++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir @@ -0,0 +1,37 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.398942292> : tensor<1024xf32> + %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> + %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %12 = arith.extf %11 : tensor<1024xbf16> to tensor<1024xf32> + %13 = arith.mulf %12, %cst_3 : tensor<1024xf32> + %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32> + %15 = arith.addf %14, %cst_2 : tensor<1024xf32> + %16 = arith.mulf %15, %cst_1 : tensor<1024xf32> + %17 = arith.mulf %12, %12 : tensor<1024xf32> + %18 = arith.mulf %17, %cst_0 : tensor<1024xf32> + %19 = math.exp %18 : tensor<1024xf32> + %20 = arith.mulf %19, %cst : tensor<1024xf32> + %21 = arith.mulf %12, %20 : tensor<1024xf32> + %22 = arith.addf %16, %21 : tensor<1024xf32> + %23 = arith.mulf %8, %22 : tensor<1024xf32> + %24 = arith.truncf %23 : tensor<1024xf32> to tensor<1024xbf16> + tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16> + tt.return + } +} diff --git a/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2962f6703fcea2c27fc6abb5ca101dc337f76a29 --- /dev/null +++ b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir @@ -0,0 +1,17 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32> + %cst_0 = arith.constant dense<12865792> : tensor<1024xi32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = arith.cmpi slt, %4, %cst_0 : tensor<1024xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..ba7bb4edb122f5853209f6a24260d4f7100dadec --- /dev/null +++ b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir @@ -0,0 +1,368 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !5 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %14 = and i32 %13, 31, !dbg !8 + %15 = lshr i32 %13, 5, !dbg !8 + %16 = shl i32 %13, 2, !dbg !8 + %17 = and i32 %16, 60, !dbg !8 + %18 = and i32 %15, 3, !dbg !8 + %19 = lshr i32 %14, 1, !dbg !8 + %20 = shl nuw nsw i32 %18, 4, !dbg !8 + %21 = or i32 %20, %19, !dbg !8 + %22 = and i32 %16, 4, !dbg !9 + %23 = lshr i32 %14, 4, !dbg !9 + %24 = shl nuw nsw i32 %18, 1, !dbg !9 + %25 = or i32 %24, %23, !dbg !9 + %26 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %27 = shl i32 %26, 6, !dbg !11 + %28 = or i32 %27, %17, !dbg !12 + %29 = or i32 %27, %21, !dbg !12 + %.frozen = freeze i32 %28 + %30 = sdiv i32 %.frozen, 256, !dbg !13 + %31 = mul i32 %30, 256 + %.decomposed = sub i32 %.frozen, %31 + %32 = sdiv i32 %29, 256, !dbg !13 + %33 = shl i32 %30, 15, !dbg !14 + %34 = shl nsw i32 %32, 7, !dbg !15 + %35 = add i32 %33, %.decomposed + %36 = mul nuw nsw i32 %17, 12 + %37 = or i32 %25, %36 + %38 = zext nneg i32 %37 to i64 + %39 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %38 + %40 = or i32 %36, 12 + %41 = add nuw nsw i32 %40, %25 + %42 = zext nneg i32 %41 to i64 + %43 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %42 + %44 = add nuw nsw i32 %36, 24 + %45 = or i32 %44, %25 + %46 = zext nneg i32 %45 to i64 + %47 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %46 + %48 = add nuw nsw i32 %36, 36 + %49 = add nuw nsw i32 %48, %25 + %50 = zext nneg i32 %49 to i64 + %51 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %50 + %52 = mul nuw nsw i32 %21, 12 + %53 = add nuw nsw i32 %52, %22 + %54 = zext nneg i32 %53 to i64 + %55 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %54 + %56 = getelementptr float, ptr addrspace(3) @global_smem, i64 %38 + %57 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42 + %58 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46 + %59 = getelementptr float, ptr addrspace(3) @global_smem, i64 %50 + %60 = getelementptr float, ptr addrspace(3) @global_smem, i64 %54 + %61 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 1 + %62 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 2 + %63 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 3 + br label %64, !dbg !16 + +64: ; preds = %12, %64 + %65 = phi i32 [ 0, %12 ], [ %205, %64 ] + %66 = phi <8 x float> [ zeroinitializer, %12 ], [ %204, %64 ] + %67 = or i32 %65, %22, !dbg !17 + %68 = or i32 %65, %25, !dbg !17 + %69 = shl i32 %68, 8, !dbg !18 + %70 = add i32 %35, %69, !dbg !19 + %71 = sext i32 %70 to i64, !dbg !20 + %72 = getelementptr i16, ptr addrspace(1) %0, i64 %71, !dbg !20 + %73 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %74 = extractvalue { i32, i32 } %73, 0, !dbg !21 + %75 = extractvalue { i32, i32 } %73, 1, !dbg !21 + %76 = trunc i32 %74 to i16, !dbg !21 + %extelt.offset = lshr i32 %74, 16, !dbg !21 + %77 = trunc i32 %extelt.offset to i16, !dbg !21 + %78 = trunc i32 %75 to i16, !dbg !21 + %extelt.offset1 = lshr i32 %75, 16, !dbg !21 + %79 = trunc i32 %extelt.offset1 to i16, !dbg !21 + tail call void @llvm.nvvm.barrier0(), !dbg !22 + %80 = insertelement <1 x i16> undef, i16 %76, i64 0, !dbg !22 + store <1 x i16> %80, ptr addrspace(3) %39, align 2, !dbg !22 + %81 = insertelement <1 x i16> undef, i16 %77, i64 0, !dbg !22 + store <1 x i16> %81, ptr addrspace(3) %43, align 2, !dbg !22 + %82 = insertelement <1 x i16> undef, i16 %78, i64 0, !dbg !22 + store <1 x i16> %82, ptr addrspace(3) %47, align 2, !dbg !22 + %83 = insertelement <1 x i16> undef, i16 %79, i64 0, !dbg !22 + store <1 x i16> %83, ptr addrspace(3) %51, align 2, !dbg !22 + tail call void @llvm.nvvm.barrier0(), !dbg !22 + %84 = load i16, ptr addrspace(3) %55, align 8, !dbg !22 + %85 = load i16, ptr addrspace(3) %61, align 2, !dbg !22 + %86 = load i16, ptr addrspace(3) %62, align 4, !dbg !22 + %87 = load i16, ptr addrspace(3) %63, align 2, !dbg !22 + %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #3, !dbg !22 + %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #3, !dbg !22 + %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #3, !dbg !22 + %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %87) #3, !dbg !22 + %92 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !23 + %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %92, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24 + %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !24 + %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !24 + %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !24 + %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !24 + %98 = bitcast i32 %94 to float, !dbg !24 + %99 = bitcast i32 %95 to float, !dbg !24 + %100 = bitcast i32 %96 to float, !dbg !24 + %101 = bitcast i32 %97 to float, !dbg !24 + tail call void @llvm.nvvm.barrier0(), !dbg !24 + %102 = insertelement <1 x float> undef, float %98, i64 0, !dbg !24 + store <1 x float> %102, ptr addrspace(3) %56, align 4, !dbg !24 + %103 = insertelement <1 x float> undef, float %99, i64 0, !dbg !24 + store <1 x float> %103, ptr addrspace(3) %57, align 4, !dbg !24 + %104 = insertelement <1 x float> undef, float %100, i64 0, !dbg !24 + store <1 x float> %104, ptr addrspace(3) %58, align 4, !dbg !24 + %105 = insertelement <1 x float> undef, float %101, i64 0, !dbg !24 + store <1 x float> %105, ptr addrspace(3) %59, align 4, !dbg !24 + tail call void @llvm.nvvm.barrier0(), !dbg !24 + %106 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !24 + %107 = getelementptr i16, ptr addrspace(1) %2, i64 %71, !dbg !25 + %108 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26 + %109 = extractvalue { i32, i32 } %108, 0, !dbg !26 + %110 = extractvalue { i32, i32 } %108, 1, !dbg !26 + %111 = trunc i32 %109 to i16, !dbg !26 + %extelt.offset2 = lshr i32 %109, 16, !dbg !26 + %112 = trunc i32 %extelt.offset2 to i16, !dbg !26 + %113 = trunc i32 %110 to i16, !dbg !26 + %extelt.offset3 = lshr i32 %110, 16, !dbg !26 + %114 = trunc i32 %extelt.offset3 to i16, !dbg !26 + %115 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #3, !dbg !27 + %116 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #3, !dbg !27 + %117 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #3, !dbg !27 + %118 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #3, !dbg !27 + %119 = add i32 %67, %34, !dbg !28 + %120 = sext i32 %119 to i64, !dbg !29 + %121 = getelementptr float, ptr addrspace(1) %3, i64 %120, !dbg !29 + %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %121, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !30 + %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !30 + %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !30 + %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !30 + %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !30 + %127 = getelementptr float, ptr addrspace(1) %4, i64 %120, !dbg !31 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %127, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !32 + %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !32 + %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !32 + %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !32 + %133 = getelementptr i16, ptr addrspace(1) %5, i64 %71, !dbg !33 + %134 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !34 + %135 = extractvalue { i32, i32 } %134, 0, !dbg !34 + %136 = extractvalue { i32, i32 } %134, 1, !dbg !34 + %137 = trunc i32 %135 to i16, !dbg !34 + %extelt.offset4 = lshr i32 %135, 16, !dbg !34 + %138 = trunc i32 %extelt.offset4 to i16, !dbg !34 + %139 = trunc i32 %136 to i16, !dbg !34 + %extelt.offset5 = lshr i32 %136, 16, !dbg !34 + %140 = trunc i32 %extelt.offset5 to i16, !dbg !34 + tail call void @llvm.nvvm.barrier0(), !dbg !35 + %141 = insertelement <1 x i16> undef, i16 %137, i64 0, !dbg !35 + store <1 x i16> %141, ptr addrspace(3) %39, align 2, !dbg !35 + %142 = insertelement <1 x i16> undef, i16 %138, i64 0, !dbg !35 + store <1 x i16> %142, ptr addrspace(3) %43, align 2, !dbg !35 + %143 = insertelement <1 x i16> undef, i16 %139, i64 0, !dbg !35 + store <1 x i16> %143, ptr addrspace(3) %47, align 2, !dbg !35 + %144 = insertelement <1 x i16> undef, i16 %140, i64 0, !dbg !35 + store <1 x i16> %144, ptr addrspace(3) %51, align 2, !dbg !35 + tail call void @llvm.nvvm.barrier0(), !dbg !35 + %145 = load i16, ptr addrspace(3) %55, align 8, !dbg !35 + %146 = load i16, ptr addrspace(3) %61, align 2, !dbg !35 + %147 = load i16, ptr addrspace(3) %62, align 4, !dbg !35 + %148 = load i16, ptr addrspace(3) %63, align 2, !dbg !35 + %149 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %145) #3, !dbg !35 + %150 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %146) #3, !dbg !35 + %151 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %147) #3, !dbg !35 + %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %148) #3, !dbg !35 + %153 = getelementptr float, ptr addrspace(1) %6, i64 %120, !dbg !36 + %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %153, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !37 + %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !37 + %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !37 + %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !37 + %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !37 + %159 = getelementptr float, ptr addrspace(1) %7, i64 %120, !dbg !38 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !39 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !39 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !39 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !39 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !39 + %165 = fadd float %115, %98, !dbg !40 + %166 = fadd float %116, %99, !dbg !40 + %167 = fadd float %117, %100, !dbg !40 + %168 = fadd float %118, %101, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %169 = insertelement <1 x float> undef, float %165, i64 0, !dbg !40 + store <1 x float> %169, ptr addrspace(3) %56, align 4, !dbg !40 + %170 = insertelement <1 x float> undef, float %166, i64 0, !dbg !40 + store <1 x float> %170, ptr addrspace(3) %57, align 4, !dbg !40 + %171 = insertelement <1 x float> undef, float %167, i64 0, !dbg !40 + store <1 x float> %171, ptr addrspace(3) %58, align 4, !dbg !40 + %172 = insertelement <1 x float> undef, float %168, i64 0, !dbg !40 + store <1 x float> %172, ptr addrspace(3) %59, align 4, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %173 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !40 + %174 = insertelement <8 x i32> poison, i32 %155, i64 0, !dbg !37 + %175 = insertelement <8 x i32> %174, i32 %156, i64 1, !dbg !37 + %176 = insertelement <8 x i32> %175, i32 %157, i64 2, !dbg !37 + %177 = insertelement <8 x i32> %176, i32 %158, i64 3, !dbg !37 + %178 = insertelement <8 x i32> %177, i32 %123, i64 4, !dbg !37 + %179 = insertelement <8 x i32> %178, i32 %124, i64 5, !dbg !37 + %180 = insertelement <8 x i32> %179, i32 %125, i64 6, !dbg !37 + %181 = insertelement <8 x i32> %180, i32 %126, i64 7, !dbg !37 + %182 = bitcast <8 x i32> %181 to <8 x float>, !dbg !37 + %183 = insertelement <8 x i32> poison, i32 %161, i64 0, !dbg !39 + %184 = insertelement <8 x i32> %183, i32 %162, i64 1, !dbg !39 + %185 = insertelement <8 x i32> %184, i32 %163, i64 2, !dbg !39 + %186 = insertelement <8 x i32> %185, i32 %164, i64 3, !dbg !39 + %187 = insertelement <8 x i32> %186, i32 %129, i64 4, !dbg !39 + %188 = insertelement <8 x i32> %187, i32 %130, i64 5, !dbg !39 + %189 = insertelement <8 x i32> %188, i32 %131, i64 6, !dbg !39 + %190 = insertelement <8 x i32> %189, i32 %132, i64 7, !dbg !39 + %191 = bitcast <8 x i32> %190 to <8 x float>, !dbg !39 + %192 = shufflevector <4 x float> %106, <4 x float> %173, <8 x i32> , !dbg !41 + %193 = fsub <8 x float> %192, %182, !dbg !41 + %194 = fmul <8 x float> %193, %191, !dbg !42 + %195 = insertelement <8 x float> poison, float %149, i64 0, !dbg !43 + %196 = insertelement <8 x float> %195, float %150, i64 1, !dbg !43 + %197 = insertelement <8 x float> %196, float %151, i64 2, !dbg !43 + %198 = insertelement <8 x float> %197, float %152, i64 3, !dbg !43 + %199 = insertelement <8 x float> %198, float %88, i64 4, !dbg !43 + %200 = insertelement <8 x float> %199, float %89, i64 5, !dbg !43 + %201 = insertelement <8 x float> %200, float %90, i64 6, !dbg !43 + %202 = insertelement <8 x float> %201, float %91, i64 7, !dbg !43 + %203 = fmul <8 x float> %202, %194, !dbg !43 + %204 = fadd <8 x float> %66, %203, !dbg !44 + %205 = add nuw nsw i32 %65, 8, !dbg !16 + %206 = icmp ult i32 %65, 120, !dbg !16 + br i1 %206, label %64, label %207, !dbg !16 + +207: ; preds = %64 + %208 = and i32 %13, 63, !dbg !8 + %209 = or i32 %27, %208, !dbg !12 + %shift = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !45 + %210 = fadd <8 x float> %204, %shift, !dbg !45 + %shift28 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !45 + %211 = fadd <8 x float> %shift28, %210, !dbg !45 + %shift29 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !45 + %212 = fadd <8 x float> %shift29, %211, !dbg !45 + %213 = extractelement <8 x float> %212, i64 4, !dbg !45 + %214 = bitcast float %213 to i32, !dbg !51 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !51 + %216 = bitcast i32 %215 to float, !dbg !51 + %217 = fadd float %213, %216, !dbg !45 + tail call void @llvm.nvvm.barrier0(), !dbg !53 + %218 = zext nneg i32 %21 to i64, !dbg !53 + %219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !53 + %220 = insertelement <1 x float> undef, float %217, i64 0, !dbg !53 + store <1 x float> %220, ptr addrspace(3) %219, align 4, !dbg !53 + tail call void @llvm.nvvm.barrier0(), !dbg !53 + %221 = zext nneg i32 %208 to i64, !dbg !53 + %222 = getelementptr float, ptr addrspace(3) @global_smem, i64 %221, !dbg !53 + %223 = load i32, ptr addrspace(3) %222, align 4, !dbg !53 + %224 = sext i32 %209 to i64, !dbg !54 + %225 = getelementptr float, ptr addrspace(1) %8, i64 %224, !dbg !54 + %226 = and i32 %13, 64, !dbg !55 + %227 = icmp eq i32 %226, 0, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %223, ptr addrspace(1) %225, i1 %227) #3, !dbg !55 + %shift30 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !56 + %228 = fadd <8 x float> %204, %shift30, !dbg !56 + %shift31 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !56 + %229 = fadd <8 x float> %shift31, %228, !dbg !56 + %shift32 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !56 + %230 = fadd <8 x float> %shift32, %229, !dbg !56 + %231 = extractelement <8 x float> %230, i64 0, !dbg !56 + %232 = bitcast float %231 to i32, !dbg !59 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !59 + %234 = bitcast i32 %233 to float, !dbg !59 + %235 = fadd float %231, %234, !dbg !56 + tail call void @llvm.nvvm.barrier0(), !dbg !61 + %236 = insertelement <1 x float> undef, float %235, i64 0, !dbg !61 + store <1 x float> %236, ptr addrspace(3) %219, align 4, !dbg !61 + tail call void @llvm.nvvm.barrier0(), !dbg !61 + %237 = load i32, ptr addrspace(3) %222, align 4, !dbg !61 + %238 = getelementptr float, ptr addrspace(1) %9, i64 %224, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %237, ptr addrspace(1) %238, i1 %227) #3, !dbg !63 + ret void, !dbg !64 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py", directory: "/tmp/torchinductor_root/3x") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 26, column: 20, scope: !5) +!14 = !DILocation(line: 34, column: 57, scope: !5) +!15 = !DILocation(line: 37, column: 44, scope: !5) +!16 = !DILocation(line: 30, column: 36, scope: !5) +!17 = !DILocation(line: 31, column: 27, scope: !5) +!18 = !DILocation(line: 34, column: 44, scope: !5) +!19 = !DILocation(line: 34, column: 51, scope: !5) +!20 = !DILocation(line: 34, column: 34, scope: !5) +!21 = !DILocation(line: 34, column: 63, scope: !5) +!22 = !DILocation(line: 34, column: 115, scope: !5) +!23 = !DILocation(line: 35, column: 34, scope: !5) +!24 = !DILocation(line: 35, column: 63, scope: !5) +!25 = !DILocation(line: 36, column: 34, scope: !5) +!26 = !DILocation(line: 36, column: 63, scope: !5) +!27 = !DILocation(line: 36, column: 115, scope: !5) +!28 = !DILocation(line: 37, column: 40, scope: !5) +!29 = !DILocation(line: 37, column: 34, scope: !5) +!30 = !DILocation(line: 37, column: 50, scope: !5) +!31 = !DILocation(line: 38, column: 34, scope: !5) +!32 = !DILocation(line: 38, column: 50, scope: !5) +!33 = !DILocation(line: 39, column: 35, scope: !5) +!34 = !DILocation(line: 39, column: 64, scope: !5) +!35 = !DILocation(line: 39, column: 116, scope: !5) +!36 = !DILocation(line: 40, column: 35, scope: !5) +!37 = !DILocation(line: 40, column: 51, scope: !5) +!38 = !DILocation(line: 41, column: 35, scope: !5) +!39 = !DILocation(line: 41, column: 51, scope: !5) +!40 = !DILocation(line: 44, column: 22, scope: !5) +!41 = !DILocation(line: 52, column: 23, scope: !5) +!42 = !DILocation(line: 53, column: 24, scope: !5) +!43 = !DILocation(line: 54, column: 24, scope: !5) +!44 = !DILocation(line: 57, column: 40, scope: !5) +!45 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !49) +!46 = distinct !DILexicalBlockFile(scope: !48, file: !47, discriminator: 0) +!47 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!48 = distinct !DILexicalBlockFile(scope: !5, file: !47, discriminator: 0) +!49 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !50) +!50 = !DILocation(line: 58, column: 27, scope: !46) +!51 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !52) +!52 = !DILocation(line: 58, column: 27, scope: !48) +!53 = !DILocation(line: 58, column: 30, scope: !5) +!54 = !DILocation(line: 59, column: 25, scope: !5) +!55 = !DILocation(line: 59, column: 37, scope: !5) +!56 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !57) +!57 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !58) +!58 = !DILocation(line: 60, column: 27, scope: !46) +!59 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !60) +!60 = !DILocation(line: 60, column: 27, scope: !48) +!61 = !DILocation(line: 60, column: 30, scope: !5) +!62 = !DILocation(line: 61, column: 25, scope: !5) +!63 = !DILocation(line: 61, column: 37, scope: !5) +!64 = !DILocation(line: 61, column: 4, scope: !5) diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..18a3ae9222737ae7ea7dd913c7f107d1a193a44a --- /dev/null +++ b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir @@ -0,0 +1,127 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked1> + %cst_1 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> + %cst_2 = arith.constant dense<32768> : tensor<64x1xi32, #blocked> + %cst_3 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1> + %cst_5 = arith.constant dense<128> : tensor<1x8xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c8_i32 = arith.constant 8 : i32 + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %5 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %6 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %7 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2> + %8 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %9 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %10 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2> + %11 = arith.addi %8, %5 : tensor<64x1xi32, #blocked> + %12 = arith.addi %9, %6 : tensor<64x1xi32, #blocked1> + %13 = arith.addi %10, %7 : tensor<64x1xi32, #blocked2> + %14 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %16 = tt.expand_dims %14 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1> + %17 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %18 = arith.remsi %11, %cst : tensor<64x1xi32, #blocked> + %19 = arith.divsi %11, %cst : tensor<64x1xi32, #blocked> + %20 = arith.divsi %12, %cst_0 : tensor<64x1xi32, #blocked1> + %21 = tt.broadcast %18 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %22 = arith.muli %19, %cst_2 : tensor<64x1xi32, #blocked> + %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %24 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %25 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %26 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %27 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked1> + %28 = tt.broadcast %27 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1> + %29 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %30 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %31 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %32 = tt.splat %arg6 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %33 = tt.splat %arg7 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %34:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_6, %arg14 = %cst_6) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 { + %45 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked1> + %46 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked> + %47 = arith.addi %45, %16 : tensor<1x8xi32, #blocked1> + %48 = arith.addi %46, %17 : tensor<1x8xi32, #blocked> + %49 = arith.cmpi slt, %47, %cst_4 : tensor<1x8xi32, #blocked1> + %50 = arith.cmpi slt, %48, %cst_5 : tensor<1x8xi32, #blocked> + %51 = arith.muli %48, %cst_3 : tensor<1x8xi32, #blocked> + %52 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %53 = arith.addi %21, %52 : tensor<64x8xi32, #blocked> + %54 = arith.addi %53, %23 : tensor<64x8xi32, #blocked> + %55 = tt.addptr %24, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %56 = tt.broadcast %49 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1> + %57 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %58 = tt.load %55, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %59 = triton_gpu.convert_layout %58 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1> + %60 = arith.extf %59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> + %61 = tt.addptr %25, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %62 = tt.load %61, %57, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %63 = triton_gpu.convert_layout %62 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1> + %64 = tt.addptr %26, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %65 = tt.load %64, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %66 = arith.extf %65 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> + %67 = tt.broadcast %47 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1> + %68 = arith.addi %67, %28 : tensor<64x8xi32, #blocked1> + %69 = tt.addptr %29, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %70 = tt.load %69, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %71 = tt.addptr %30, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %72 = tt.load %71, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %73 = tt.addptr %31, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %74 = tt.load %73, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %75 = triton_gpu.convert_layout %74 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1> + %76 = arith.extf %75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> + %77 = tt.addptr %32, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %78 = tt.load %77, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %79 = tt.addptr %33, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %80 = tt.load %79, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %81 = arith.addf %62, %66 : tensor<64x8xf32, #blocked> + %82 = triton_gpu.convert_layout %81 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1> + %83 = arith.subf %82, %70 : tensor<64x8xf32, #blocked1> + %84 = arith.mulf %83, %72 : tensor<64x8xf32, #blocked1> + %85 = arith.mulf %60, %84 : tensor<64x8xf32, #blocked1> + %86 = arith.addf %arg13, %85 : tensor<64x8xf32, #blocked1> + %87 = arith.select %56, %86, %arg13 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> + %88 = arith.subf %63, %78 : tensor<64x8xf32, #blocked1> + %89 = arith.mulf %88, %80 : tensor<64x8xf32, #blocked1> + %90 = arith.mulf %76, %89 : tensor<64x8xf32, #blocked1> + %91 = arith.addf %arg14, %90 : tensor<64x8xf32, #blocked1> + %92 = arith.select %56, %91, %arg14 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> + scf.yield %87, %92 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1> + } + %35 = "tt.reduce"(%34#0) <{axis = 1 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %45 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %45 : f32 + }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %36 = triton_gpu.convert_layout %35 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %37 = tt.expand_dims %36 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2> + %38 = tt.splat %arg8 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked2> + %39 = tt.addptr %38, %13 : tensor<64x1x!tt.ptr, #blocked2>, tensor<64x1xi32, #blocked2> + tt.store %39, %37 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2> + %40 = "tt.reduce"(%34#1) <{axis = 1 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %45 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %45 : f32 + }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %41 = triton_gpu.convert_layout %40 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %42 = tt.expand_dims %41 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2> + %43 = tt.splat %arg9 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked2> + %44 = tt.addptr %43, %13 : tensor<64x1x!tt.ptr, #blocked2>, tensor<64x1xi32, #blocked2> + tt.store %44, %42 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2> + tt.return + } +} diff --git a/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..984c6333935b33b6c2b6eecbe07e5377b90d7a29 --- /dev/null +++ b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir @@ -0,0 +1,333 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %10 = and i32 %9, 31, !dbg !10 + %11 = lshr i32 %9, 5, !dbg !10 + %12 = and i32 %11, 1, !dbg !10 + %urem = shl i32 %9, 2, !dbg !10 + %13 = and i32 %urem, 252, !dbg !10 + %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %15 = shl i32 %14, 8, !dbg !12 + %16 = or i32 %15, %13, !dbg !13 + %17 = sext i32 %16 to i64, !dbg !14 + %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !14 + %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15 + %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15 + %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15 + %24 = bitcast i32 %22 to float, !dbg !15 + %25 = bitcast i32 %23 to float, !dbg !15 + %26 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !16 + %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %28 = extractvalue { i32, i32 } %27, 0, !dbg !17 + %29 = extractvalue { i32, i32 } %27, 1, !dbg !17 + %30 = trunc i32 %28 to i16, !dbg !17 + %extelt.offset = lshr i32 %28, 16, !dbg !17 + %31 = trunc i32 %extelt.offset to i16, !dbg !17 + %32 = trunc i32 %29 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %29, 16, !dbg !17 + %33 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 + %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18 + %38 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !19 + %39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %38, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %40 = extractvalue { i32, i32 } %39, 0, !dbg !20 + %41 = extractvalue { i32, i32 } %39, 1, !dbg !20 + %42 = trunc i32 %40 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %40, 16, !dbg !20 + %43 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %44 = trunc i32 %41 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %41, 16, !dbg !20 + %45 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21 + %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21 + %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21 + %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21 + %50 = getelementptr i16, ptr addrspace(1) %3, i64 %17, !dbg !22 + %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 + %52 = extractvalue { i32, i32 } %51, 0, !dbg !23 + %53 = extractvalue { i32, i32 } %51, 1, !dbg !23 + %54 = trunc i32 %52 to i16, !dbg !23 + %extelt.offset4 = lshr i32 %52, 16, !dbg !23 + %55 = trunc i32 %extelt.offset4 to i16, !dbg !23 + %56 = trunc i32 %53 to i16, !dbg !23 + %extelt.offset5 = lshr i32 %53, 16, !dbg !23 + %57 = trunc i32 %extelt.offset5 to i16, !dbg !23 + %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !24 + %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !24 + %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24 + %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24 + %62 = zext nneg i32 %13 to i64, !dbg !25 + %63 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !25 + %64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26 + %65 = fadd float %36, %24, !dbg !27 + %66 = fadd float %37, %25, !dbg !27 + %67 = fadd float %65, %48, !dbg !28 + %68 = fadd float %66, %49, !dbg !28 + %69 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !15 + %70 = insertelement <2 x i32> %69, i32 %21, i64 1, !dbg !15 + %71 = bitcast <2 x i32> %70 to <2 x float>, !dbg !15 + %72 = insertelement <2 x float> poison, float %34, i64 0, !dbg !27 + %73 = insertelement <2 x float> %72, float %35, i64 1, !dbg !27 + %74 = fadd <2 x float> %73, %71, !dbg !27 + %75 = insertelement <2 x float> poison, float %46, i64 0, !dbg !28 + %76 = insertelement <2 x float> %75, float %47, i64 1, !dbg !28 + %77 = fadd <2 x float> %74, %76, !dbg !28 + %78 = insertelement <2 x float> poison, float %58, i64 0, !dbg !29 + %79 = insertelement <2 x float> %78, float %59, i64 1, !dbg !29 + %80 = fadd <2 x float> %77, %79, !dbg !29 + %81 = fadd float %67, %60, !dbg !29 + %82 = fadd float %68, %61, !dbg !29 + %83 = extractelement <2 x float> %80, i64 0, !dbg !30 + %84 = extractelement <2 x float> %80, i64 1, !dbg !30 + %85 = fadd float %83, %84, !dbg !30 + %86 = fadd float %85, %81, !dbg !30 + %87 = fadd float %86, %82, !dbg !30 + %88 = bitcast float %87 to i32, !dbg !36 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !36 + %90 = bitcast i32 %89 to float, !dbg !36 + %91 = fadd float %87, %90, !dbg !30 + %92 = bitcast float %91 to i32, !dbg !36 + %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 8, i32 31), !dbg !36 + %94 = bitcast i32 %93 to float, !dbg !36 + %95 = fadd float %91, %94, !dbg !30 + %96 = bitcast float %95 to i32, !dbg !36 + %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 4, i32 31), !dbg !36 + %98 = bitcast i32 %97 to float, !dbg !36 + %99 = fadd float %95, %98, !dbg !30 + %100 = bitcast float %99 to i32, !dbg !36 + %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 2, i32 31), !dbg !36 + %102 = bitcast i32 %101 to float, !dbg !36 + %103 = fadd float %99, %102, !dbg !30 + %104 = bitcast float %103 to i32, !dbg !36 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !36 + %106 = bitcast i32 %105 to float, !dbg !36 + %107 = fadd float %103, %106, !dbg !30 + %108 = icmp eq i32 %10, 0, !dbg !36 + %109 = zext nneg i32 %12 to i64, !dbg !36 + %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !36 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %107, i1 %108) #6, !dbg !36 + tail call void @llvm.nvvm.barrier0(), !dbg !36 + %111 = icmp slt i32 %9, 2, !dbg !36 + %112 = sext i32 %9 to i64, !dbg !36 + %113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !36 + %114 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !36 + %115 = bitcast float %114 to i32, !dbg !36 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !36 + %117 = bitcast i32 %116 to float, !dbg !36 + %118 = fadd float %114, %117, !dbg !30 + %119 = and i32 %9, 1, !dbg !36 + %120 = icmp eq i32 %119, 0, !dbg !36 + %121 = and i1 %111, %120, !dbg !36 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %118, i1 %121) #6, !dbg !36 + tail call void @llvm.nvvm.barrier0(), !dbg !36 + %122 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36 + %123 = fadd float %122, 0.000000e+00, !dbg !38 + %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #6, !dbg !42 + %125 = fsub float %83, %124, !dbg !43 + %126 = fsub float %84, %124, !dbg !43 + %127 = fsub float %81, %124, !dbg !43 + %128 = fsub float %82, %124, !dbg !43 + %129 = fmul float %125, %125, !dbg !44 + %130 = fmul float %126, %126, !dbg !44 + %131 = fmul float %127, %127, !dbg !44 + %132 = fmul float %128, %128, !dbg !44 + tail call void @llvm.nvvm.barrier0(), !dbg !45 + %133 = fadd float %129, %130, !dbg !47 + %134 = fadd float %131, %133, !dbg !47 + %135 = fadd float %132, %134, !dbg !47 + %136 = bitcast float %135 to i32, !dbg !45 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !45 + %138 = bitcast i32 %137 to float, !dbg !45 + %139 = fadd float %135, %138, !dbg !47 + %140 = bitcast float %139 to i32, !dbg !45 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !45 + %142 = bitcast i32 %141 to float, !dbg !45 + %143 = fadd float %139, %142, !dbg !47 + %144 = bitcast float %143 to i32, !dbg !45 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !45 + %146 = bitcast i32 %145 to float, !dbg !45 + %147 = fadd float %143, %146, !dbg !47 + %148 = bitcast float %147 to i32, !dbg !45 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !45 + %150 = bitcast i32 %149 to float, !dbg !45 + %151 = fadd float %147, %150, !dbg !47 + %152 = bitcast float %151 to i32, !dbg !45 + %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !45 + %154 = bitcast i32 %153 to float, !dbg !45 + %155 = fadd float %151, %154, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %155, i1 %108) #6, !dbg !45 + tail call void @llvm.nvvm.barrier0(), !dbg !45 + %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !45 + %157 = bitcast float %156 to i32, !dbg !45 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !45 + %159 = bitcast i32 %158 to float, !dbg !45 + %160 = fadd float %156, %159, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %160, i1 %121) #6, !dbg !45 + tail call void @llvm.nvvm.barrier0(), !dbg !45 + %161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45 + %162 = fadd float %161, 0.000000e+00, !dbg !50 + %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float 2.560000e+02) #6, !dbg !52 + %164 = fadd float %163, 0x3EE4F8B580000000, !dbg !53 + %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54 + %.not.i = icmp eq i32 %165, 0, !dbg !54 + br i1 %.not.i, label %168, label %166, !dbg !54 + +166: ; preds = %8 + %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !54 + br label %__nv_rsqrtf.exit, !dbg !54 + +168: ; preds = %8 + %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !54 + br label %__nv_rsqrtf.exit, !dbg !54 + +__nv_rsqrtf.exit: ; preds = %166, %168 + %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !54 + %170 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !26 + %171 = bitcast i32 %170 to float, !dbg !26 + %172 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !26 + %173 = bitcast i32 %172 to float, !dbg !26 + %174 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !26 + %175 = bitcast i32 %174 to float, !dbg !26 + %176 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !26 + %177 = bitcast i32 %176 to float, !dbg !26 + %178 = fmul float %125, %.0.i, !dbg !55 + %179 = fmul float %126, %.0.i, !dbg !55 + %180 = fmul float %127, %.0.i, !dbg !55 + %181 = fmul float %128, %.0.i, !dbg !55 + %182 = fmul float %178, %177, !dbg !56 + %183 = fmul float %179, %175, !dbg !56 + %184 = fmul float %180, %173, !dbg !56 + %185 = fmul float %181, %171, !dbg !56 + %186 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !57 + %187 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %182) #6, !dbg !58 + %188 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %183) #6, !dbg !58 + %189 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !58 + %190 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !58 + %191 = insertelement <2 x i16> undef, i16 %187, i64 0, !dbg !58 + %192 = insertelement <2 x i16> %191, i16 %188, i64 1, !dbg !58 + %193 = bitcast <2 x i16> %192 to i32, !dbg !58 + %194 = insertelement <2 x i16> undef, i16 %189, i64 0, !dbg !58 + %195 = insertelement <2 x i16> %194, i16 %190, i64 1, !dbg !58 + %196 = bitcast <2 x i16> %195 to i32, !dbg !58 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %193, i32 %196, ptr addrspace(1) %186, i1 true) #6, !dbg !58 + ret void, !dbg !59 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "c4qmi2qsgi5mnuig7w3wx5jmjnmvktjlgcv4c6q7w2vaw3bk6qzb.py", directory: "/tmp/torchinductor_root/4q") +!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 30, scope: !7) +!20 = !DILocation(line: 32, column: 46, scope: !7) +!21 = !DILocation(line: 32, column: 67, scope: !7) +!22 = !DILocation(line: 33, column: 30, scope: !7) +!23 = !DILocation(line: 33, column: 46, scope: !7) +!24 = !DILocation(line: 33, column: 67, scope: !7) +!25 = !DILocation(line: 34, column: 31, scope: !7) +!26 = !DILocation(line: 34, column: 36, scope: !7) +!27 = !DILocation(line: 36, column: 18, scope: !7) +!28 = !DILocation(line: 38, column: 18, scope: !7) +!29 = !DILocation(line: 40, column: 18, scope: !7) +!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34) +!31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0) +!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0) +!34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35) +!35 = !DILocation(line: 45, column: 59, scope: !31) +!36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37) +!37 = !DILocation(line: 45, column: 59, scope: !33) +!38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!41 = !DILocation(line: 45, column: 45, scope: !39) +!42 = !DILocation(line: 48, column: 20, scope: !7) +!43 = !DILocation(line: 49, column: 20, scope: !7) +!44 = !DILocation(line: 50, column: 20, scope: !7) +!45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46) +!46 = !DILocation(line: 53, column: 59, scope: !33) +!47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48) +!48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49) +!49 = !DILocation(line: 53, column: 59, scope: !31) +!50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51) +!51 = !DILocation(line: 53, column: 45, scope: !39) +!52 = !DILocation(line: 56, column: 20, scope: !7) +!53 = !DILocation(line: 58, column: 20, scope: !7) +!54 = !DILocation(line: 59, column: 26, scope: !7) +!55 = !DILocation(line: 60, column: 20, scope: !7) +!56 = !DILocation(line: 61, column: 20, scope: !7) +!57 = !DILocation(line: 63, column: 25, scope: !7) +!58 = !DILocation(line: 63, column: 48, scope: !7) +!59 = !DILocation(line: 63, column: 4, scope: !7) diff --git a/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..fa6eac07b78e8f52e20628595ec10c06d7358457 Binary files /dev/null and b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin differ diff --git a/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..aab6e43e09a606798af3f019df529172c5e95a26 --- /dev/null +++ b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.llir @@ -0,0 +1,230 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 60, !dbg !8 + %11 = and i32 %8, 3, !dbg !9 + %12 = lshr i32 %7, 4, !dbg !9 + %13 = shl nuw nsw i32 %11, 1, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %16 = shl i32 %15, 6, !dbg !11 + %17 = or i32 %16, %10, !dbg !12 + br label %18, !dbg !13 + +18: ; preds = %5, %18 + %19 = phi i32 [ 0, %5 ], [ %37, %18 ] + %20 = phi <4 x float> [ zeroinitializer, %5 ], [ %36, %18 ] + %21 = or i32 %19, %14, !dbg !14 + %22 = shl i32 %21, 17, !dbg !15 + %23 = add i32 %17, %22, !dbg !16 + %24 = sext i32 %23 to i64, !dbg !17 + %25 = getelementptr float, ptr addrspace(1) %0, i64 %24, !dbg !17 + %26 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %27 = extractvalue { i32, i32, i32, i32 } %26, 0, !dbg !18 + %28 = extractvalue { i32, i32, i32, i32 } %26, 1, !dbg !18 + %29 = extractvalue { i32, i32, i32, i32 } %26, 2, !dbg !18 + %30 = extractvalue { i32, i32, i32, i32 } %26, 3, !dbg !18 + %31 = insertelement <4 x i32> poison, i32 %27, i64 0, !dbg !18 + %32 = insertelement <4 x i32> %31, i32 %28, i64 1, !dbg !18 + %33 = insertelement <4 x i32> %32, i32 %29, i64 2, !dbg !18 + %34 = insertelement <4 x i32> %33, i32 %30, i64 3, !dbg !18 + %35 = bitcast <4 x i32> %34 to <4 x float>, !dbg !18 + %36 = fadd <4 x float> %20, %35, !dbg !19 + %37 = add nuw nsw i32 %19, 8, !dbg !13 + %38 = icmp ult i32 %19, 112, !dbg !13 + br i1 %38, label %18, label %39, !dbg !13 + +39: ; preds = %18 + %40 = and i32 %6, 63, !dbg !8 + %41 = or i32 %16, %40, !dbg !12 + %42 = or i32 %10, 3, !dbg !20 + %43 = or i32 %10, 2, !dbg !20 + %44 = or i32 %10, 1, !dbg !20 + %45 = extractelement <4 x float> %36, i64 0, !dbg !20 + %46 = bitcast float %45 to i32, !dbg !20 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !20 + %48 = bitcast i32 %47 to float, !dbg !20 + %49 = fadd float %45, %48, !dbg !24 + %50 = extractelement <4 x float> %36, i64 1, !dbg !20 + %51 = bitcast float %50 to i32, !dbg !20 + %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 16, i32 31), !dbg !20 + %53 = bitcast i32 %52 to float, !dbg !20 + %54 = fadd float %50, %53, !dbg !24 + %55 = extractelement <4 x float> %36, i64 2, !dbg !20 + %56 = bitcast float %55 to i32, !dbg !20 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20 + %58 = bitcast i32 %57 to float, !dbg !20 + %59 = fadd float %55, %58, !dbg !24 + %60 = extractelement <4 x float> %36, i64 3, !dbg !20 + %61 = bitcast float %60 to i32, !dbg !20 + %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 16, i32 31), !dbg !20 + %63 = bitcast i32 %62 to float, !dbg !20 + %64 = fadd float %60, %63, !dbg !24 + %65 = icmp ult i32 %7, 16, !dbg !20 + %66 = shl nuw nsw i32 %10, 2, !dbg !20 + %67 = or i32 %66, %11, !dbg !20 + %68 = zext nneg i32 %67 to i64, !dbg !20 + %69 = getelementptr float, ptr addrspace(3) @global_smem, i64 %68, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, float %49, i1 %65) #3, !dbg !20 + %70 = shl nuw nsw i32 %44, 2, !dbg !20 + %71 = or i32 %70, %11, !dbg !20 + %72 = zext nneg i32 %71 to i64, !dbg !20 + %73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %54, i1 %65) #3, !dbg !20 + %74 = shl nuw nsw i32 %43, 2, !dbg !20 + %75 = or i32 %74, %11, !dbg !20 + %76 = zext nneg i32 %75 to i64, !dbg !20 + %77 = getelementptr float, ptr addrspace(3) @global_smem, i64 %76, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, float %59, i1 %65) #3, !dbg !20 + %78 = shl nuw nsw i32 %42, 2, !dbg !20 + %79 = or i32 %78, %11, !dbg !20 + %80 = zext nneg i32 %79 to i64, !dbg !20 + %81 = getelementptr float, ptr addrspace(3) @global_smem, i64 %80, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %81, float %64, i1 %65) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %82 = icmp slt i32 %6, 256, !dbg !20 + %83 = sext i32 %6 to i64, !dbg !20 + %84 = getelementptr float, ptr addrspace(3) @global_smem, i64 %83, !dbg !20 + %85 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %84, i1 %82) #3, !dbg !20 + %86 = bitcast float %85 to i32, !dbg !20 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 2, i32 31), !dbg !20 + %88 = bitcast i32 %87 to float, !dbg !20 + %89 = fadd float %85, %88, !dbg !24 + %90 = bitcast float %89 to i32, !dbg !20 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 1, i32 31), !dbg !20 + %92 = bitcast i32 %91 to float, !dbg !20 + %93 = fadd float %89, %92, !dbg !24 + %94 = and i32 %6, 3, !dbg !20 + %95 = icmp eq i32 %94, 0, !dbg !20 + %96 = and i1 %82, %95, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, float %93, i1 %96) #3, !dbg !20 + %97 = add i32 %6, 128, !dbg !20 + %98 = sext i32 %97 to i64, !dbg !20 + %99 = getelementptr float, ptr addrspace(3) @global_smem, i64 %98, !dbg !20 + %100 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %99, i1 %82) #3, !dbg !20 + %101 = bitcast float %100 to i32, !dbg !20 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 2, i32 31), !dbg !20 + %103 = bitcast i32 %102 to float, !dbg !20 + %104 = fadd float %100, %103, !dbg !24 + %105 = bitcast float %104 to i32, !dbg !20 + %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !20 + %107 = bitcast i32 %106 to float, !dbg !20 + %108 = fadd float %104, %107, !dbg !24 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %99, float %108, i1 %96) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %109 = zext nneg i32 %66 to i64, !dbg !20 + %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !20 + %111 = load float, ptr addrspace(3) %110, align 4, !dbg !20 + %112 = zext nneg i32 %70 to i64, !dbg !20 + %113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !20 + %114 = load float, ptr addrspace(3) %113, align 4, !dbg !20 + %115 = zext nneg i32 %74 to i64, !dbg !20 + %116 = getelementptr float, ptr addrspace(3) @global_smem, i64 %115, !dbg !20 + %117 = load float, ptr addrspace(3) %116, align 4, !dbg !20 + %118 = zext nneg i32 %78 to i64, !dbg !20 + %119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !20 + %120 = load float, ptr addrspace(3) %119, align 4, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %121 = zext nneg i32 %10 to i64, !dbg !28 + %122 = getelementptr float, ptr addrspace(3) @global_smem, i64 %121, !dbg !28 + %123 = insertelement <1 x float> undef, float %111, i64 0, !dbg !28 + store <1 x float> %123, ptr addrspace(3) %122, align 4, !dbg !28 + %124 = zext nneg i32 %44 to i64, !dbg !28 + %125 = getelementptr float, ptr addrspace(3) @global_smem, i64 %124, !dbg !28 + %126 = insertelement <1 x float> undef, float %114, i64 0, !dbg !28 + store <1 x float> %126, ptr addrspace(3) %125, align 4, !dbg !28 + %127 = zext nneg i32 %43 to i64, !dbg !28 + %128 = getelementptr float, ptr addrspace(3) @global_smem, i64 %127, !dbg !28 + %129 = insertelement <1 x float> undef, float %117, i64 0, !dbg !28 + store <1 x float> %129, ptr addrspace(3) %128, align 4, !dbg !28 + %130 = zext nneg i32 %42 to i64, !dbg !28 + %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !28 + %132 = insertelement <1 x float> undef, float %120, i64 0, !dbg !28 + store <1 x float> %132, ptr addrspace(3) %131, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %133 = zext nneg i32 %40 to i64, !dbg !28 + %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !28 + %135 = load <1 x float>, ptr addrspace(3) %134, align 4, !dbg !28 + %.frozen = freeze i32 %41 + %136 = sdiv i32 %.frozen, 256, !dbg !29 + %137 = mul i32 %136, 256 + %.decomposed = sub i32 %.frozen, %137 + %138 = sext i32 %136 to i64, !dbg !30 + %139 = getelementptr i64, ptr addrspace(1) %1, i64 %138, !dbg !30 + %140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %139, i1 true) #3, !dbg !31 + %141 = lshr i64 %140, 54, !dbg !32 + %142 = and i64 %141, 512, !dbg !32 + %143 = add i64 %142, %140, !dbg !32 + %144 = shl i64 %143, 8, !dbg !33 + %145 = sext i32 %.decomposed to i64, !dbg !34 + %146 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !35 + %147 = getelementptr float, ptr addrspace(1) %146, i64 %145, !dbg !35 + %148 = and i32 %6, 64, !dbg !36 + %149 = icmp eq i32 %148, 0, !dbg !36 + %150 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %147, <1 x float> %135, i1 %149) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 27, column: 36, scope: !5) +!14 = !DILocation(line: 28, column: 27, scope: !5) +!15 = !DILocation(line: 31, column: 47, scope: !5) +!16 = !DILocation(line: 31, column: 40, scope: !5) +!17 = !DILocation(line: 31, column: 34, scope: !5) +!18 = !DILocation(line: 31, column: 53, scope: !5) +!19 = !DILocation(line: 34, column: 38, scope: !5) +!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!23 = !DILocation(line: 35, column: 25, scope: !21) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fef8e3f747ee60b76d2d6b42b03dd957c4b44eaf --- /dev/null +++ b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ptx @@ -0,0 +1,296 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<3>; + .reg .b16 %rs<3>; + .reg .b32 %r<12>; + .reg .b64 %rd<7>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2de_param_0]; + ld.param.u64 %rd4, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 254; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r10, %r1, 8; + .loc 1 21 23 + or.b32 %r11, %r10, %r9; + .loc 1 24 30 + mul.wide.s32 %rd5, %r11, 2; + add.s64 %rd1, %rd3, %rd5; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + .loc 1 24 44 + cvt.f32.bf16 %r5, %rs1; + cvt.f32.bf16 %r6, %rs2; + .loc 1 26 25 + mul.wide.s32 %rd6, %r11, 4; + add.s64 %rd2, %rd4, %rd6; + .loc 1 26 36 + @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/zl/czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 122 +.b8 108 +.b8 54 +.b8 110 +.b8 109 +.b8 119 +.b8 97 +.b8 115 +.b8 108 +.b8 55 +.b8 107 +.b8 52 +.b8 105 +.b8 99 +.b8 53 +.b8 53 +.b8 120 +.b8 111 +.b8 119 +.b8 105 +.b8 104 +.b8 99 +.b8 122 +.b8 99 +.b8 111 +.b8 111 +.b8 104 +.b8 51 +.b8 109 +.b8 104 +.b8 117 +.b8 53 +.b8 118 +.b8 54 +.b8 108 +.b8 115 +.b8 54 +.b8 119 +.b8 50 +.b8 120 +.b8 122 +.b8 113 +.b8 113 +.b8 111 +.b8 99 +.b8 100 +.b8 99 +.b8 50 +.b8 100 +.b8 97 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 122 +.b8 108 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4be8d6c6a891dc7cc8dea35d392e86cc997c3c74 --- /dev/null +++ b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ptx @@ -0,0 +1,777 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8de9de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7d8de9de( + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<40>; + .reg .b16 %rs<13>; + .reg .b32 %r<118>; + .reg .f32 %f<94>; + .reg .b64 %rd<28>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_0]; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8de9de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r84, %tid.x; + and.b32 %r85, %r84, 31; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8de9de_param_2]; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8de9de_param_3]; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8de9de_param_4]; + shl.b32 %r86, %r84, 2; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_5]; + and.b32 %r87, %r86, 252; + ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_6]; + ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r88, %r1, 8; + .loc 1 30 36 + or.b32 %r89, %r88, %r87; + .loc 1 30 30 + mul.wide.s32 %rd24, %r89, 2; + add.s64 %rd1, %rd17, %rd24; + mov.b32 %r4, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r4; + @!%p1 mov.u32 %r3, %r4; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 30 67 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + .loc 1 31 30 + mul.wide.u32 %rd25, %r87, 4; + add.s64 %rd2, %rd18, %rd25; + .loc 1 31 35 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r4; + @!%p1 mov.u32 %r11, %r4; + @!%p1 mov.u32 %r12, %r4; + @!%p1 mov.u32 %r13, %r4; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 32 30 + mul.wide.s32 %rd26, %r89, 4; + add.s64 %rd3, %rd19, %rd26; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r4; + @!%p1 mov.u32 %r19, %r4; + @!%p1 mov.u32 %r20, %r4; + @!%p1 mov.u32 %r21, %r4; + mov.b32 %f9, %r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + .loc 1 33 30 + add.s64 %rd4, %rd20, %rd24; + .loc 1 33 46 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r4; + @!%p1 mov.u32 %r27, %r4; + cvt.u16.u32 %rs5, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r26; } + cvt.u16.u32 %rs7, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } + .loc 1 33 67 + cvt.f32.bf16 %r30, %rs5; + mov.b32 %f13, %r30; + cvt.f32.bf16 %r31, %rs6; + mov.b32 %f14, %r31; + cvt.f32.bf16 %r32, %rs7; + mov.b32 %f15, %r32; + cvt.f32.bf16 %r33, %rs8; + mov.b32 %f16, %r33; + .loc 1 34 31 + mul.wide.s32 %rd27, %r1, 4; + add.s64 %rd5, %rd21, %rd27; + .loc 1 34 36 + mov.u32 %r34, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r34 }, [ %rd5 + 0 ]; + mov.b32 %f17, %r34; + mov.u32 %r35, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ]; + mov.u32 %r36, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ]; + mov.u32 %r37, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ]; + .loc 1 35 31 + add.s64 %rd9, %rd22, %rd27; + .loc 1 35 36 + mov.u32 %r63, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r63 }, [ %rd9 + 0 ]; + mov.b32 %f18, %r63; + mov.u32 %r39, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r39 }, [ %rd9 + 0 ]; + mov.u32 %r40, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r40 }, [ %rd9 + 0 ]; + mov.u32 %r41, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r41 }, [ %rd9 + 0 ]; + .loc 1 36 35 + add.s64 %rd13, %rd16, %rd26; + .loc 1 36 51 + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + mov.u32 %r45, 0x0; + @%p1 ld.global.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd13 + 0 ]; + @!%p1 mov.u32 %r42, %r4; + @!%p1 mov.u32 %r43, %r4; + @!%p1 mov.u32 %r44, %r4; + @!%p1 mov.u32 %r45, %r4; + mov.b32 %f19, %r42; + mov.b32 %f20, %r43; + mov.b32 %f21, %r44; + mov.b32 %f22, %r45; + .loc 1 38 18 + mul.f32 %f23, %f1, %f5; + mul.f32 %f24, %f2, %f6; + mul.f32 %f25, %f3, %f7; + mul.f32 %f26, %f4, %f8; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f27, %f1, %f5, %f24; + fma.rn.f32 %f28, %f3, %f7, %f27; + fma.rn.f32 %f29, %f4, %f8, %f28; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r90, %f29; + shfl.sync.bfly.b32 %r91, %r90, 16, 31, -1; + mov.b32 %f30, %r91; +$L__tmp3: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r92, %f31; + shfl.sync.bfly.b32 %r93, %r92, 8, 31, -1; + mov.b32 %f32, %r93; +$L__tmp5: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r94, %f33; + shfl.sync.bfly.b32 %r95, %r94, 4, 31, -1; + mov.b32 %f34, %r95; +$L__tmp7: + .loc 2 233 15 + add.f32 %f35, %f33, %f34; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r96, %f35; + shfl.sync.bfly.b32 %r97, %r96, 2, 31, -1; + mov.b32 %f36, %r97; +$L__tmp9: + .loc 2 233 15 + add.f32 %f37, %f35, %f36; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r98, %f37; + shfl.sync.bfly.b32 %r99, %r98, 1, 31, -1; + mov.b32 %f38, %r99; +$L__tmp11: + .loc 2 233 15 + add.f32 %f39, %f37, %f38; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p30, %r85, 0; + shr.u32 %r100, %r84, 3; + and.b32 %r101, %r100, 4; + mov.u32 %r102, global_smem; + add.s32 %r50, %r102, %r101; + mov.b32 %r51, %f39; + @%p30 st.shared.b32 [ %r50 + 0 ], %r51; + bar.sync 0; + setp.lt.s32 %p31, %r84, 2; + add.s32 %r53, %r102, %r86; + @%p31 ld.shared.b32 %r52, [ %r53 + 0 ]; + mov.b32 %f40, %r52; + shfl.sync.bfly.b32 %r103, %r52, 1, 31, -1; + mov.b32 %f41, %r103; +$L__tmp13: + .loc 2 233 15 + add.f32 %f42, %f40, %f41; +$L__tmp14: + .loc 2 243 36 + and.b32 %r104, %r84, 1; + setp.eq.b32 %p38, %r104, 1; + not.pred %p39, %p38; + and.pred %p32, %p31, %p39; + mov.b32 %r55, %f42; + @%p32 st.shared.b32 [ %r53 + 0 ], %r55; + bar.sync 0; + ld.shared.f32 %f43, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f44, %f43, 0f00000000; +$L__tmp16: + .loc 1 43 19 + add.f32 %f45, %f13, %f9; + add.f32 %f46, %f14, %f10; + add.f32 %f47, %f15, %f11; + add.f32 %f48, %f16, %f12; + .loc 1 44 20 + sub.f32 %f49, %f45, %f17; + sub.f32 %f50, %f46, %f17; + sub.f32 %f51, %f47, %f17; + sub.f32 %f52, %f48, %f17; + .loc 1 45 20 + mul.f32 %f53, %f49, %f18; + mul.f32 %f54, %f50, %f18; + mul.f32 %f55, %f51, %f18; + mul.f32 %f56, %f52, %f18; + .loc 1 46 19 + mul.f32 %f57, %f24, %f54; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f58, %f23, %f53, %f57; + fma.rn.f32 %f59, %f25, %f55, %f58; + fma.rn.f32 %f60, %f26, %f56, %f59; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r105, %f60; + shfl.sync.bfly.b32 %r106, %r105, 16, 31, -1; + mov.b32 %f61, %r106; +$L__tmp20: + .loc 2 233 15 + add.f32 %f62, %f60, %f61; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r107, %f62; + shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1; + mov.b32 %f63, %r108; +$L__tmp22: + .loc 2 233 15 + add.f32 %f64, %f62, %f63; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r109, %f64; + shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1; + mov.b32 %f65, %r110; +$L__tmp24: + .loc 2 233 15 + add.f32 %f66, %f64, %f65; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r111, %f66; + shfl.sync.bfly.b32 %r112, %r111, 2, 31, -1; + mov.b32 %f67, %r112; +$L__tmp26: + .loc 2 233 15 + add.f32 %f68, %f66, %f67; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r113, %f68; + shfl.sync.bfly.b32 %r114, %r113, 1, 31, -1; + mov.b32 %f69, %r114; +$L__tmp28: + .loc 2 233 15 + add.f32 %f70, %f68, %f69; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r57, %f70; + @%p30 st.shared.b32 [ %r50 + 0 ], %r57; + bar.sync 0; + @%p31 ld.shared.b32 %r58, [ %r53 + 0 ]; + mov.b32 %f71, %r58; + shfl.sync.bfly.b32 %r115, %r58, 1, 31, -1; + mov.b32 %f72, %r115; +$L__tmp30: + .loc 2 233 15 + add.f32 %f73, %f71, %f72; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r61, %f73; + @%p32 st.shared.b32 [ %r53 + 0 ], %r61; + bar.sync 0; + ld.shared.f32 %f74, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f75, %f74, 0f00000000; + mov.b32 %r64, 1132462080; +$L__tmp33: + .loc 1 51 20 + div.full.f32 %r62, %r63, %r64; + mov.b32 %f76, %r62; + .loc 1 53 20 + neg.f32 %f77, %f44; + fma.rn.f32 %f78, %f23, 0f43800000, %f77; + fma.rn.f32 %f79, %f24, 0f43800000, %f77; + fma.rn.f32 %f80, %f25, 0f43800000, %f77; + fma.rn.f32 %f81, %f26, 0f43800000, %f77; + .loc 1 55 20 + neg.f32 %f82, %f53; + fma.rn.f32 %f83, %f82, %f75, %f78; + neg.f32 %f84, %f54; + fma.rn.f32 %f85, %f84, %f75, %f79; + neg.f32 %f86, %f55; + fma.rn.f32 %f87, %f86, %f75, %f80; + neg.f32 %f88, %f56; + fma.rn.f32 %f89, %f88, %f75, %f81; + .loc 1 57 20 + fma.rn.f32 %f90, %f76, %f83, %f19; + fma.rn.f32 %f91, %f76, %f85, %f20; + fma.rn.f32 %f92, %f76, %f87, %f21; + fma.rn.f32 %f93, %f76, %f89, %f22; + .loc 1 59 51 + mov.b32 %r74, %f90; + mov.b32 %r75, %f91; + mov.b32 %r76, %f92; + mov.b32 %r77, %f93; + @%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r74, %r75, %r76, %r77 }; + .loc 1 60 25 + add.s64 %rd15, %rd23, %rd24; + .loc 1 60 48 + cvt.rn.bf16.f32 %rs9, %r74; + cvt.rn.bf16.f32 %rs10, %r75; + cvt.rn.bf16.f32 %rs11, %r76; + cvt.rn.bf16.f32 %rs12, %r77; + mov.b32 %r116, {%rs9, %rs10}; + mov.b32 %r117, {%rs11, %rs12}; + @%p1 st.global.v2.b32 [ %rd15 + 0 ], { %r116, %r117 }; + .loc 1 60 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/fh/cfhjzwujbd4bpel57x4hxw7d3m3qqfwrjg6bfe6e4wk2cyh77u45.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 407 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 102 +.b8 104 +.b8 106 +.b8 122 +.b8 119 +.b8 117 +.b8 106 +.b8 98 +.b8 100 +.b8 52 +.b8 98 +.b8 112 +.b8 101 +.b8 108 +.b8 53 +.b8 55 +.b8 120 +.b8 52 +.b8 104 +.b8 120 +.b8 119 +.b8 55 +.b8 100 +.b8 51 +.b8 109 +.b8 51 +.b8 113 +.b8 113 +.b8 102 +.b8 119 +.b8 114 +.b8 106 +.b8 103 +.b8 54 +.b8 98 +.b8 102 +.b8 101 +.b8 54 +.b8 101 +.b8 52 +.b8 119 +.b8 107 +.b8 50 +.b8 99 +.b8 121 +.b8 104 +.b8 55 +.b8 55 +.b8 117 +.b8 52 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 102 +.b8 104 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 41 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 41 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 41 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 49 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 49 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 49 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c6faece66f94c74b342fe6e80cc0c5b022681e23 --- /dev/null +++ b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttgir @@ -0,0 +1,78 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %18 = tt.load %17, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %19 = arith.extf %18 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %20 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %21 = tt.splat %20 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %23 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %24 = tt.splat %23 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %26 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %27 = tt.addptr %26, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %29 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %30 = arith.select %2, %29, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %55 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %55 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %32 = arith.addf %31, %cst_1 : f32 + %33 = arith.addf %15, %19 : tensor<256xf32, #blocked> + %34 = tt.broadcast %22 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %35 = arith.subf %33, %34 : tensor<256xf32, #blocked> + %36 = tt.broadcast %25 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %37 = arith.mulf %35, %36 : tensor<256xf32, #blocked> + %38 = arith.mulf %29, %37 : tensor<256xf32, #blocked> + %39 = arith.select %2, %38, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %55 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %55 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %41 = arith.addf %40, %cst_1 : f32 + %42 = arith.divf %25, %cst_0 : tensor<1xf32, #blocked> + %43 = arith.mulf %29, %cst_3 : tensor<256xf32, #blocked> + %44 = tt.splat %32 : (f32) -> tensor<256xf32, #blocked> + %45 = arith.subf %43, %44 : tensor<256xf32, #blocked> + %46 = tt.splat %41 : (f32) -> tensor<256xf32, #blocked> + %47 = arith.mulf %37, %46 : tensor<256xf32, #blocked> + %48 = arith.subf %45, %47 : tensor<256xf32, #blocked> + %49 = tt.broadcast %42 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %50 = arith.mulf %49, %48 : tensor<256xf32, #blocked> + %51 = arith.addf %28, %50 : tensor<256xf32, #blocked> + tt.store %27, %51, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %52 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %53 = tt.addptr %52, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %54 = arith.truncf %51 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %53, %54, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8efe09e06ac45cb16955328477190b028cc24b64 --- /dev/null +++ b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir @@ -0,0 +1,100 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked> + %cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked> + %cst_3 = arith.constant 9.99999974E-6 : f32 + %cst_4 = arith.constant 2.560000e+02 : f32 + %cst_5 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %c512_i32 = arith.constant 512 : i32 + %cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1> + %cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_9 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.remsi %0, %c512_i32 : i32 + %4 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %5 = tt.splat %4 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %6 = tt.splat %4 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked1> + %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked> + %8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1> + %9 = arith.muli %3, %c256_i32 : i32 + %10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked> + %11 = arith.addi %1, %10 : tensor<256xi32, #blocked> + %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %13 = tt.addptr %12, %11 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %15 = arith.muli %0, %c256_i32 : i32 + %16 = tt.splat %15 : (i32) -> tensor<256xi32, #blocked> + %17 = arith.addi %1, %16 : tensor<256xi32, #blocked> + %18 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %19 = tt.addptr %18, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %20 = tt.load %19, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %21 = arith.extf %20 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %22 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %23 = tt.addptr %22, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %24 = tt.load %23, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %25 = arith.extf %24 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %26 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %27 = tt.addptr %26, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %28 = tt.load %27, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %29 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked> + %30 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1> + %31 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked> + %32 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1> + %33 = arith.select %31, %29, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked> + %34 = arith.select %32, %30, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1> + %35 = arith.cmpi sge, %34, %cst_7 : tensor<1xi64, #blocked1> + %36 = arith.cmpi slt, %34, %cst_6 : tensor<1xi64, #blocked1> + %37 = arith.andi %35, %36 : tensor<1xi1, #blocked1> + tt.assert %37, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1> + %38 = arith.muli %33, %cst_2 : tensor<1xi64, #blocked> + %39 = tt.broadcast %38 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked> + %40 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> + %41 = arith.addi %40, %39 : tensor<256xi64, #blocked> + %42 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %43 = tt.addptr %42, %41 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %44 = tt.load %43, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %45 = arith.addf %44, %14 : tensor<256xf32, #blocked> + %46 = arith.addf %45, %21 : tensor<256xf32, #blocked> + %47 = arith.addf %46, %25 : tensor<256xf32, #blocked> + %48 = arith.select %2, %47, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %49 = "tt.reduce"(%48) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %69 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %69 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %50 = arith.addf %49, %cst_5 : f32 + %51 = arith.divf %50, %cst_4 : f32 + %52 = tt.splat %51 : (f32) -> tensor<256xf32, #blocked> + %53 = arith.subf %47, %52 : tensor<256xf32, #blocked> + %54 = arith.mulf %53, %53 : tensor<256xf32, #blocked> + %55 = arith.select %2, %54, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %56 = "tt.reduce"(%55) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %69 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %69 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %57 = arith.addf %56, %cst_5 : f32 + %58 = arith.divf %57, %cst_4 : f32 + %59 = arith.addf %58, %cst_3 : f32 + %60 = tt.extern_elementwise %59 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %61 = tt.splat %60 : (f32) -> tensor<256xf32, #blocked> + %62 = arith.mulf %53, %61 : tensor<256xf32, #blocked> + %63 = arith.mulf %62, %28 : tensor<256xf32, #blocked> + %64 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %65 = tt.addptr %64, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %65, %47, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %66 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %67 = tt.addptr %66, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %68 = arith.truncf %63 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %67, %68, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..552ef7804443cde50f69b432e0224d07e338ee3e --- /dev/null +++ b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttgir @@ -0,0 +1,16 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.cubin b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7214fd933149b50129b035df4a070804d38ad830 Binary files /dev/null and b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.cubin differ diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ptx b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b0d269ae0f8fadc3a630a2b150507e8a1f4a2279 --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ptx @@ -0,0 +1,764 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<52>; + .reg .b32 %r<152>; + .reg .f32 %f<107>; + .reg .b64 %rd<30>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd12, [triton__0d1d2d3de4e_param_0]; + ld.param.u64 %rd13, [triton__0d1d2d3de4e_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r84, %tid.x; + and.b32 %r85, %r84, 31; + ld.param.u64 %rd14, [triton__0d1d2d3de4e_param_2]; + shl.b32 %r86, %r84, 2; + and.b32 %r87, %r86, 60; + .loc 1 24 33 + bfe.u32 %r88, %r84, 5, 3; + bfe.u32 %r89, %r84, 4, 1; + shl.b32 %r90, %r88, 1; + or.b32 %r91, %r90, %r89; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 21 33 + shl.b32 %r92, %r1, 6; + .loc 1 22 23 + or.b32 %r93, %r92, %r87; + .loc 1 31 47 + shl.b32 %r94, %r91, 17; + .loc 1 31 40 + add.s32 %r95, %r94, %r93; + add.s32 %r96, %r95, 2097152; + add.s32 %r97, %r95, 4194304; + add.s32 %r98, %r95, 6291456; + .loc 1 31 34 + mul.wide.s32 %rd15, %r95, 4; + add.s64 %rd1, %rd12, %rd15; + mul.wide.s32 %rd16, %r96, 4; + add.s64 %rd2, %rd12, %rd16; + mul.wide.s32 %rd17, %r97, 4; + add.s64 %rd3, %rd12, %rd17; + mul.wide.s32 %rd18, %r98, 4; + add.s64 %rd4, %rd12, %rd18; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 31 53 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + @!%p1 mov.u32 %r12, %r6; + @!%p1 mov.u32 %r13, %r6; + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + @!%p1 mov.u32 %r20, %r6; + @!%p1 mov.u32 %r21, %r6; + mov.b32 %f1, %r18; + mov.b32 %f2, %r19; + mov.b32 %f3, %r20; + mov.b32 %f4, %r21; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + @!%p1 mov.u32 %r28, %r6; + @!%p1 mov.u32 %r29, %r6; + .loc 1 34 38 + add.f32 %f5, %f1, 0f00000000; + add.f32 %f6, %f2, 0f00000000; + add.f32 %f7, %f3, 0f00000000; + add.f32 %f8, %f4, 0f00000000; + .loc 1 28 27 + or.b32 %r99, %r91, 112; + .loc 1 29 25 + setp.lt.u32 %p36, %r99, 120; + .loc 1 31 47 + shl.b32 %r100, %r99, 17; + .loc 1 31 40 + add.s32 %r101, %r95, 8388608; + add.s32 %r102, %r95, 10485760; + add.s32 %r103, %r95, 12582912; + add.s32 %r104, %r100, %r93; + .loc 1 31 34 + mul.wide.s32 %rd19, %r101, 4; + add.s64 %rd5, %rd12, %rd19; + mul.wide.s32 %rd20, %r102, 4; + add.s64 %rd6, %rd12, %rd20; + mul.wide.s32 %rd21, %r103, 4; + add.s64 %rd7, %rd12, %rd21; + mul.wide.s32 %rd22, %r104, 4; + add.s64 %rd8, %rd12, %rd22; + .loc 1 31 53 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + @!%p1 mov.u32 %r36, %r6; + @!%p1 mov.u32 %r37, %r6; + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + mov.u32 %r45, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ]; + @!%p1 mov.u32 %r42, %r6; + @!%p1 mov.u32 %r43, %r6; + @!%p1 mov.u32 %r44, %r6; + @!%p1 mov.u32 %r45, %r6; + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + mov.u32 %r52, 0x0; + mov.u32 %r53, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd7 + 0 ]; + @!%p1 mov.u32 %r50, %r6; + @!%p1 mov.u32 %r51, %r6; + @!%p1 mov.u32 %r52, %r6; + @!%p1 mov.u32 %r53, %r6; + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + mov.u32 %r60, 0x0; + mov.u32 %r61, 0x0; + @%p36 ld.global.L1::evict_first.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd8 + 0 ]; + @!%p36 mov.u32 %r58, %r6; + @!%p36 mov.u32 %r59, %r6; + @!%p36 mov.u32 %r60, %r6; + @!%p36 mov.u32 %r61, %r6; + mov.b32 %f9, %r58; + mov.b32 %f10, %r59; + mov.b32 %f11, %r60; + mov.b32 %f12, %r61; + mov.b32 %f13, %r2; + mov.b32 %f14, %r10; + .loc 1 34 38 + add.f32 %f15, %f14, 0f00000000; + add.f32 %f16, %f13, 0f00000000; + .loc 1 31 53 + mov.b32 %f17, %r42; + mov.b32 %f18, %r34; + .loc 1 34 38 + add.f32 %f19, %f16, %f18; + add.f32 %f20, %f15, %f17; + .loc 1 31 53 + mov.b32 %f21, %r3; + mov.b32 %f22, %r11; + .loc 1 34 38 + add.f32 %f23, %f22, 0f00000000; + add.f32 %f24, %f21, 0f00000000; + .loc 1 31 53 + mov.b32 %f25, %r43; + mov.b32 %f26, %r35; + .loc 1 34 38 + add.f32 %f27, %f24, %f26; + add.f32 %f28, %f23, %f25; + .loc 1 31 53 + mov.b32 %f29, %r4; + mov.b32 %f30, %r12; + .loc 1 34 38 + add.f32 %f31, %f30, 0f00000000; + add.f32 %f32, %f29, 0f00000000; + .loc 1 31 53 + mov.b32 %f33, %r44; + mov.b32 %f34, %r36; + .loc 1 34 38 + add.f32 %f35, %f32, %f34; + add.f32 %f36, %f31, %f33; + .loc 1 31 53 + mov.b32 %f37, %r5; + mov.b32 %f38, %r13; + .loc 1 34 38 + add.f32 %f39, %f38, 0f00000000; + add.f32 %f40, %f37, 0f00000000; + .loc 1 31 53 + mov.b32 %f41, %r45; + mov.b32 %f42, %r37; + .loc 1 34 38 + add.f32 %f43, %f40, %f42; + add.f32 %f44, %f39, %f41; + selp.f32 %f45, %f9, 0f80000000, %p36; + selp.f32 %f46, %f10, 0f80000000, %p36; + selp.f32 %f47, %f11, 0f80000000, %p36; + selp.f32 %f48, %f12, 0f80000000, %p36; + .loc 1 22 44 + and.b32 %r105, %r84, 63; + .loc 1 22 23 + or.b32 %r106, %r92, %r105; +$L__tmp1: + .loc 2 233 15 + add.f32 %f49, %f19, %f20; + add.f32 %f50, %f27, %f28; + add.f32 %f51, %f35, %f36; + add.f32 %f52, %f43, %f44; +$L__tmp2: + .loc 1 31 53 + mov.b32 %f53, %r26; + mov.b32 %f54, %r50; + .loc 1 34 38 + add.f32 %f55, %f5, %f54; + add.f32 %f56, %f53, 0f00000000; + add.f32 %f57, %f55, %f49; + add.f32 %f58, %f56, %f45; + .loc 1 31 53 + mov.b32 %f59, %r27; + mov.b32 %f60, %r51; + .loc 1 34 38 + add.f32 %f61, %f6, %f60; + add.f32 %f62, %f59, 0f00000000; + add.f32 %f63, %f61, %f50; + add.f32 %f64, %f62, %f46; + .loc 1 31 53 + mov.b32 %f65, %r28; + mov.b32 %f66, %r52; + .loc 1 34 38 + add.f32 %f67, %f7, %f66; + add.f32 %f68, %f65, 0f00000000; + add.f32 %f69, %f67, %f51; + add.f32 %f70, %f68, %f47; + .loc 1 31 53 + mov.b32 %f71, %r29; + mov.b32 %f72, %r53; + .loc 1 34 38 + add.f32 %f73, %f8, %f72; + add.f32 %f74, %f71, 0f00000000; + add.f32 %f75, %f73, %f52; + add.f32 %f76, %f74, %f48; +$L__tmp3: + .loc 2 233 15 + add.f32 %f77, %f58, %f57; + add.f32 %f78, %f64, %f63; + add.f32 %f79, %f70, %f69; + add.f32 %f80, %f76, %f75; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r107, %f77; + shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1; + mov.b32 %f81, %r108; +$L__tmp5: + .loc 2 233 15 + add.f32 %f82, %f77, %f81; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r109, %f78; + shfl.sync.bfly.b32 %r110, %r109, 16, 31, -1; + mov.b32 %f83, %r110; +$L__tmp7: + .loc 2 233 15 + add.f32 %f84, %f78, %f83; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r111, %f79; + shfl.sync.bfly.b32 %r112, %r111, 16, 31, -1; + mov.b32 %f85, %r112; +$L__tmp9: + .loc 2 233 15 + add.f32 %f86, %f79, %f85; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r113, %f80; + shfl.sync.bfly.b32 %r114, %r113, 16, 31, -1; + mov.b32 %f87, %r114; +$L__tmp11: + .loc 2 233 15 + add.f32 %f88, %f80, %f87; +$L__tmp12: + .loc 2 243 36 + setp.lt.u32 %p41, %r85, 16; + shl.b32 %r115, %r88, 2; + shl.b32 %r116, %r87, 5; + or.b32 %r117, %r116, %r115; + mov.u32 %r118, global_smem; + add.s32 %r66, %r118, %r117; + mov.b32 %r67, %f82; + @%p41 st.shared.b32 [ %r66 + 0 ], %r67; + or.b32 %r119, %r116, 32; + or.b32 %r120, %r119, %r115; + add.s32 %r68, %r118, %r120; + mov.b32 %r69, %f84; + @%p41 st.shared.b32 [ %r68 + 0 ], %r69; + or.b32 %r121, %r116, 64; + or.b32 %r122, %r121, %r115; + add.s32 %r70, %r118, %r122; + mov.b32 %r71, %f86; + @%p41 st.shared.b32 [ %r70 + 0 ], %r71; + or.b32 %r123, %r116, 96; + or.b32 %r124, %r123, %r115; + add.s32 %r72, %r118, %r124; + mov.b32 %r73, %f88; + @%p41 st.shared.b32 [ %r72 + 0 ], %r73; + bar.sync 0; + setp.lt.s32 %p45, %r84, 512; + add.s32 %r75, %r118, %r86; + @%p45 ld.shared.b32 %r74, [ %r75 + 0 ]; + mov.b32 %f89, %r74; + shfl.sync.bfly.b32 %r125, %r74, 4, 31, -1; + mov.b32 %f90, %r125; +$L__tmp13: + .loc 2 233 15 + add.f32 %f91, %f89, %f90; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r126, %f91; + shfl.sync.bfly.b32 %r127, %r126, 2, 31, -1; + mov.b32 %f92, %r127; +$L__tmp15: + .loc 2 233 15 + add.f32 %f93, %f91, %f92; +$L__tmp16: + .loc 2 243 36 + mov.b32 %r128, %f93; + shfl.sync.bfly.b32 %r129, %r128, 1, 31, -1; + mov.b32 %f94, %r129; +$L__tmp17: + .loc 2 233 15 + add.f32 %f95, %f93, %f94; +$L__tmp18: + .loc 2 243 36 + and.b32 %r130, %r84, 7; + setp.eq.s32 %p51, %r130, 0; + and.pred %p46, %p45, %p51; + mov.b32 %r77, %f95; + @%p46 st.shared.b32 [ %r75 + 0 ], %r77; + add.s32 %r79, %r75, 1024; + @%p45 ld.shared.b32 %r78, [ %r79 + 0 ]; + mov.b32 %f96, %r78; + shfl.sync.bfly.b32 %r131, %r78, 4, 31, -1; + mov.b32 %f97, %r131; +$L__tmp19: + .loc 2 233 15 + add.f32 %f98, %f96, %f97; +$L__tmp20: + .loc 2 243 36 + mov.b32 %r132, %f98; + shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1; + mov.b32 %f99, %r133; +$L__tmp21: + .loc 2 233 15 + add.f32 %f100, %f98, %f99; +$L__tmp22: + .loc 2 243 36 + mov.b32 %r134, %f100; + shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1; + mov.b32 %f101, %r135; +$L__tmp23: + .loc 2 233 15 + add.f32 %f102, %f100, %f101; +$L__tmp24: + .loc 2 243 36 + mov.b32 %r81, %f102; + @%p46 st.shared.b32 [ %r79 + 0 ], %r81; + bar.sync 0; + add.s32 %r136, %r118, %r116; + ld.shared.f32 %f103, [%r136]; + add.s32 %r137, %r118, %r119; + ld.shared.f32 %f104, [%r137]; + add.s32 %r138, %r118, %r121; + ld.shared.f32 %f105, [%r138]; + add.s32 %r139, %r118, %r123; + ld.shared.f32 %f106, [%r139]; +$L__tmp25: + .loc 1 35 28 + bar.sync 0; + shl.b32 %r140, %r87, 2; + add.s32 %r141, %r118, %r140; + st.shared.f32 [%r141], %f103; + st.shared.f32 [%r141+4], %f104; + st.shared.f32 [%r141+8], %f105; + st.shared.f32 [%r141+12], %f106; + bar.sync 0; + shl.b32 %r142, %r105, 2; + add.s32 %r143, %r118, %r142; + .loc 1 36 20 + shr.s32 %r145, %r106, 31; + shr.u32 %r146, %r145, 24; + add.s32 %r147, %r106, %r146; + shr.s32 %r148, %r147, 8; + and.b32 %r149, %r147, -256; + sub.s32 %r150, %r106, %r149; + .loc 1 38 30 + mul.wide.s32 %rd23, %r148, 8; + add.s64 %rd10, %rd13, %rd23; + .loc 1 45 55 + ld.shared.u32 %r83, [%r143]; + .loc 1 38 35 + mov.u64 %rd9, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd10 + 0 ]; + .loc 1 41 32 + shr.u64 %rd24, %rd9, 54; + and.b64 %rd25, %rd24, 512; + add.s64 %rd26, %rd25, %rd9; + .loc 1 45 30 + shl.b64 %rd27, %rd26, 10; + add.s64 %rd28, %rd14, %rd27; + mul.wide.s32 %rd29, %r150, 4; + add.s64 %rd11, %rd28, %rd29; + .loc 1 45 55 + and.b32 %r151, %r84, 192; + setp.eq.s32 %p50, %r151, 0; + mov.u32 %r82, 0x0; + @%p50 atom.global.gpu.acq_rel.add.f32 %r82, [ %rd11 + 0 ], %r83; + .loc 1 45 4 + ret; +$L__tmp26: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp25 +.b8 2 +.b8 35 +.b8 25 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.cubin b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2e198efe089a1842c202e04b198c8d1491b5c619 Binary files /dev/null and b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.cubin differ diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..12dbd4fd575fd1b129e5c7ce0c0047cbd528298c --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx @@ -0,0 +1,301 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<17>; + .reg .b64 %rd<7>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2de_param_0]; + ld.param.u64 %rd4, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r12, %tid.x; + shl.b32 %r13, %r12, 2; + and.b32 %r14, %r13, 508; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r15, %r1, 9; + .loc 1 21 23 + or.b32 %r16, %r15, %r14; + .loc 1 24 30 + mul.wide.s32 %rd5, %r16, 2; + add.s64 %rd1, %rd3, %rd5; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 24 44 + cvt.f32.bf16 %r8, %rs1; + cvt.f32.bf16 %r9, %rs2; + cvt.f32.bf16 %r10, %rs3; + cvt.f32.bf16 %r11, %rs4; + .loc 1 26 25 + mul.wide.s32 %rd6, %r16, 4; + add.s64 %rd2, %rd4, %rd6; + .loc 1 26 36 + @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r8, %r9, %r10, %r11 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/zl/czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 122 +.b8 108 +.b8 54 +.b8 110 +.b8 109 +.b8 119 +.b8 97 +.b8 115 +.b8 108 +.b8 55 +.b8 107 +.b8 52 +.b8 105 +.b8 99 +.b8 53 +.b8 53 +.b8 120 +.b8 111 +.b8 119 +.b8 105 +.b8 104 +.b8 99 +.b8 122 +.b8 99 +.b8 111 +.b8 111 +.b8 104 +.b8 51 +.b8 109 +.b8 104 +.b8 117 +.b8 53 +.b8 118 +.b8 54 +.b8 108 +.b8 115 +.b8 54 +.b8 119 +.b8 50 +.b8 120 +.b8 122 +.b8 113 +.b8 113 +.b8 111 +.b8 99 +.b8 100 +.b8 99 +.b8 50 +.b8 100 +.b8 97 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 122 +.b8 108 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> + %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.cubin b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1332a22b83c749e8064e26027a5be736090f413d Binary files /dev/null and b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.cubin differ diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.llir b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..0378ebb3be855988af3082ca4215072873f1d817 --- /dev/null +++ b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.llir @@ -0,0 +1,477 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = and i32 %8, 31, !dbg !10 + %10 = lshr i32 %8, 5, !dbg !10 + %11 = and i32 %10, 1, !dbg !10 + %urem = shl i32 %8, 2, !dbg !10 + %12 = and i32 %urem, 252, !dbg !10 + %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %14 = sext i32 %13 to i64, !dbg !12 + %15 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !12 + %16 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13 + %18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13 + %19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13 + %21 = srem i32 %13, 512, !dbg !14 + %22 = shl nsw i32 %21, 8, !dbg !15 + %23 = or i32 %22, %12, !dbg !16 + %24 = sext i32 %23 to i64, !dbg !17 + %25 = getelementptr float, ptr addrspace(1) %2, i64 %24, !dbg !17 + %26 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18 + %27 = extractvalue { i32, i32, i32, i32 } %26, 0, !dbg !18 + %28 = extractvalue { i32, i32, i32, i32 } %26, 1, !dbg !18 + %29 = extractvalue { i32, i32, i32, i32 } %26, 2, !dbg !18 + %30 = extractvalue { i32, i32, i32, i32 } %26, 3, !dbg !18 + %31 = bitcast i32 %27 to float, !dbg !18 + %32 = bitcast i32 %28 to float, !dbg !18 + %33 = bitcast i32 %29 to float, !dbg !18 + %34 = bitcast i32 %30 to float, !dbg !18 + %35 = add i64 %20, 50257, !dbg !19 + %36 = icmp slt i64 %16, 0, !dbg !20 + %37 = icmp slt i64 %20, 0, !dbg !20 + %38 = select i1 %37, i64 %35, i64 %20, !dbg !21 + %39 = icmp ugt i64 %38, 50256, !dbg !22 + br i1 %39, label %40, label %41, !dbg !23 + +40: ; preds = %7 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23 + br label %41, !dbg !23 + +41: ; preds = %40, %7 + %42 = shl i64 %16, 8, !dbg !24 + %43 = add i64 %42, 12865792, !dbg !24 + %44 = select i1 %36, i64 %43, i64 %42, !dbg !24 + %45 = zext nneg i32 %12 to i64 + %46 = or i64 %44, %45, !dbg !25 + %47 = getelementptr float, ptr addrspace(1) %1, i64 %46, !dbg !26 + %48 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %47, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27 + %49 = extractvalue { i32, i32, i32, i32 } %48, 0, !dbg !27 + %50 = extractvalue { i32, i32, i32, i32 } %48, 1, !dbg !27 + %51 = extractvalue { i32, i32, i32, i32 } %48, 2, !dbg !27 + %52 = extractvalue { i32, i32, i32, i32 } %48, 3, !dbg !27 + %53 = bitcast i32 %49 to float, !dbg !27 + %54 = bitcast i32 %50 to float, !dbg !27 + %55 = bitcast i32 %51 to float, !dbg !27 + %56 = bitcast i32 %52 to float, !dbg !27 + %57 = fadd float %31, %53, !dbg !28 + %58 = fadd float %32, %54, !dbg !28 + %59 = fadd float %33, %55, !dbg !28 + %60 = fadd float %34, %56, !dbg !28 + %61 = fadd float %57, 0.000000e+00, !dbg !29 + %62 = fadd float %58, 0.000000e+00, !dbg !29 + %63 = fadd float %59, 0.000000e+00, !dbg !29 + %64 = fadd float %60, 0.000000e+00, !dbg !29 + %65 = fsub float %57, %61, !dbg !33 + %66 = fsub float %58, %62, !dbg !33 + %67 = fsub float %59, %63, !dbg !33 + %68 = fsub float %60, %64, !dbg !33 + %69 = fmul float %57, %65, !dbg !34 + %70 = fmul float %58, %66, !dbg !34 + %71 = fmul float %59, %67, !dbg !34 + %72 = fmul float %60, %68, !dbg !34 + %73 = fadd float %69, 0.000000e+00, !dbg !35 + %74 = fadd float %70, 0.000000e+00, !dbg !35 + %75 = fadd float %71, 0.000000e+00, !dbg !35 + %76 = fadd float %72, 0.000000e+00, !dbg !35 + %77 = fsub float %62, %61, !dbg !36 + %78 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !40 + %79 = fmul float %78, %77, !dbg !41 + %80 = fadd float %61, %79, !dbg !42 + %81 = fadd float %73, %74, !dbg !43 + %82 = fmul float %77, %77, !dbg !44 + %83 = fmul float %78, %82, !dbg !45 + %84 = fadd float %83, %81, !dbg !46 + %85 = fsub float %63, %80, !dbg !36 + %86 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !40 + %87 = fmul float %86, %85, !dbg !41 + %88 = fadd float %80, %87, !dbg !42 + %89 = fadd float %75, %84, !dbg !43 + %90 = fmul float %85, %85, !dbg !44 + %91 = fmul float %90, 2.000000e+00, !dbg !47 + %92 = fmul float %86, %91, !dbg !45 + %93 = fadd float %89, %92, !dbg !46 + %94 = fsub float %64, %88, !dbg !36 + %95 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !40 + %96 = fmul float %95, %94, !dbg !41 + %97 = fadd float %88, %96, !dbg !42 + %98 = fadd float %76, %93, !dbg !43 + %99 = fmul float %94, %94, !dbg !44 + %100 = fmul float %99, 3.000000e+00, !dbg !47 + %101 = fmul float %95, %100, !dbg !45 + %102 = fadd float %98, %101, !dbg !46 + %103 = bitcast float %97 to i32, !dbg !48 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !48 + %105 = bitcast i32 %104 to float, !dbg !48 + %106 = bitcast float %102 to i32, !dbg !48 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !48 + %108 = bitcast i32 %107 to float, !dbg !48 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !48 + %110 = bitcast i32 %109 to float, !dbg !48 + %111 = fsub float %105, %97, !dbg !36 + %112 = fadd float %110, 4.000000e+00, !dbg !50 + %113 = fcmp oeq float %112, 0.000000e+00, !dbg !51 + %114 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %110, float %112) #6, !dbg !40 + %115 = select i1 %113, float 0.000000e+00, float %114, !dbg !52 + %116 = fmul float %115, %111, !dbg !41 + %117 = fadd float %97, %116, !dbg !42 + %118 = fadd float %102, %108, !dbg !43 + %119 = fmul float %111, %111, !dbg !44 + %120 = fmul float %119, 4.000000e+00, !dbg !47 + %121 = fmul float %115, %120, !dbg !45 + %122 = fadd float %118, %121, !dbg !46 + %123 = bitcast float %117 to i32, !dbg !48 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 8, i32 31), !dbg !48 + %125 = bitcast i32 %124 to float, !dbg !48 + %126 = bitcast float %122 to i32, !dbg !48 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 8, i32 31), !dbg !48 + %128 = bitcast i32 %127 to float, !dbg !48 + %129 = bitcast float %112 to i32, !dbg !48 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !48 + %131 = bitcast i32 %130 to float, !dbg !48 + %132 = fsub float %125, %117, !dbg !36 + %133 = fadd float %112, %131, !dbg !50 + %134 = fcmp oeq float %133, 0.000000e+00, !dbg !51 + %135 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %131, float %133) #6, !dbg !40 + %136 = select i1 %134, float 0.000000e+00, float %135, !dbg !52 + %137 = fmul float %136, %132, !dbg !41 + %138 = fadd float %117, %137, !dbg !42 + %139 = fadd float %122, %128, !dbg !43 + %140 = fmul float %132, %132, !dbg !44 + %141 = fmul float %112, %140, !dbg !47 + %142 = fmul float %136, %141, !dbg !45 + %143 = fadd float %139, %142, !dbg !46 + %144 = bitcast float %138 to i32, !dbg !48 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !48 + %146 = bitcast i32 %145 to float, !dbg !48 + %147 = bitcast float %143 to i32, !dbg !48 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 4, i32 31), !dbg !48 + %149 = bitcast i32 %148 to float, !dbg !48 + %150 = bitcast float %133 to i32, !dbg !48 + %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 4, i32 31), !dbg !48 + %152 = bitcast i32 %151 to float, !dbg !48 + %153 = fsub float %146, %138, !dbg !36 + %154 = fadd float %133, %152, !dbg !50 + %155 = fcmp oeq float %154, 0.000000e+00, !dbg !51 + %156 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %152, float %154) #6, !dbg !40 + %157 = select i1 %155, float 0.000000e+00, float %156, !dbg !52 + %158 = fmul float %157, %153, !dbg !41 + %159 = fadd float %138, %158, !dbg !42 + %160 = fadd float %143, %149, !dbg !43 + %161 = fmul float %153, %153, !dbg !44 + %162 = fmul float %133, %161, !dbg !47 + %163 = fmul float %157, %162, !dbg !45 + %164 = fadd float %160, %163, !dbg !46 + %165 = bitcast float %159 to i32, !dbg !48 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !48 + %167 = bitcast i32 %166 to float, !dbg !48 + %168 = bitcast float %164 to i32, !dbg !48 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 2, i32 31), !dbg !48 + %170 = bitcast i32 %169 to float, !dbg !48 + %171 = bitcast float %154 to i32, !dbg !48 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 2, i32 31), !dbg !48 + %173 = bitcast i32 %172 to float, !dbg !48 + %174 = fsub float %167, %159, !dbg !36 + %175 = fadd float %154, %173, !dbg !50 + %176 = fcmp oeq float %175, 0.000000e+00, !dbg !51 + %177 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %173, float %175) #6, !dbg !40 + %178 = select i1 %176, float 0.000000e+00, float %177, !dbg !52 + %179 = fmul float %178, %174, !dbg !41 + %180 = fadd float %159, %179, !dbg !42 + %181 = fadd float %164, %170, !dbg !43 + %182 = fmul float %174, %174, !dbg !44 + %183 = fmul float %154, %182, !dbg !47 + %184 = fmul float %178, %183, !dbg !45 + %185 = fadd float %181, %184, !dbg !46 + %186 = bitcast float %180 to i32, !dbg !48 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 1, i32 31), !dbg !48 + %188 = bitcast i32 %187 to float, !dbg !48 + %189 = bitcast float %185 to i32, !dbg !48 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !48 + %191 = bitcast i32 %190 to float, !dbg !48 + %192 = bitcast float %175 to i32, !dbg !48 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !48 + %194 = bitcast i32 %193 to float, !dbg !48 + %195 = fsub float %188, %180, !dbg !36 + %196 = fadd float %175, %194, !dbg !50 + %197 = fcmp oeq float %196, 0.000000e+00, !dbg !51 + %198 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %194, float %196) #6, !dbg !40 + %199 = select i1 %197, float 0.000000e+00, float %198, !dbg !52 + %200 = fmul float %195, %199, !dbg !41 + %201 = fadd float %180, %200, !dbg !42 + %202 = fadd float %185, %191, !dbg !43 + %203 = fmul float %195, %195, !dbg !44 + %204 = fmul float %175, %203, !dbg !47 + %205 = fmul float %199, %204, !dbg !45 + %206 = fadd float %202, %205, !dbg !46 + %207 = icmp eq i32 %9, 0, !dbg !48 + %208 = zext nneg i32 %11 to i64, !dbg !48 + %209 = getelementptr float, ptr addrspace(3) @global_smem, i64 %208, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, float %201, i1 %207) #6, !dbg !48 + %210 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %208, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %210, float %206, i1 %207) #6, !dbg !48 + %211 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %208, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %211, float %196, i1 %207) #6, !dbg !48 + tail call void @llvm.nvvm.barrier0(), !dbg !48 + %212 = icmp slt i32 %8, 2, !dbg !48 + %213 = sext i32 %8 to i64, !dbg !48 + %214 = getelementptr float, ptr addrspace(3) @global_smem, i64 %213, !dbg !48 + %215 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %214, i1 %212) #6, !dbg !48 + %216 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %213, !dbg !48 + %217 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %216, i1 %212) #6, !dbg !48 + %218 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %213, !dbg !48 + %219 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %218, i1 %212) #6, !dbg !48 + %220 = bitcast float %215 to i32, !dbg !48 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 1, i32 31), !dbg !48 + %222 = bitcast i32 %221 to float, !dbg !48 + %223 = bitcast float %217 to i32, !dbg !48 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 1, i32 31), !dbg !48 + %225 = bitcast i32 %224 to float, !dbg !48 + %226 = bitcast float %219 to i32, !dbg !48 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 1, i32 31), !dbg !48 + %228 = bitcast i32 %227 to float, !dbg !48 + %229 = fsub float %222, %215, !dbg !36 + %230 = fadd float %219, %228, !dbg !50 + %231 = fcmp oeq float %230, 0.000000e+00, !dbg !51 + %232 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %228, float %230) #6, !dbg !40 + %233 = select i1 %231, float 0.000000e+00, float %232, !dbg !52 + %234 = fmul float %229, %233, !dbg !41 + %235 = fadd float %215, %234, !dbg !42 + %236 = fadd float %217, %225, !dbg !43 + %237 = fmul float %229, %229, !dbg !44 + %238 = fmul float %219, %237, !dbg !47 + %239 = fmul float %238, %233, !dbg !45 + %240 = fadd float %236, %239, !dbg !46 + %241 = and i32 %8, 1, !dbg !48 + %242 = icmp eq i32 %241, 0, !dbg !48 + %243 = and i1 %212, %242, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %214, float %235, i1 %243) #6, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %216, float %240, i1 %243) #6, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, float %230, i1 %243) #6, !dbg !48 + tail call void @llvm.nvvm.barrier0(), !dbg !48 + %244 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !48 + %245 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !48 + %246 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !53 + %247 = getelementptr float, ptr addrspace(1) %3, i64 %45, !dbg !54 + %248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !55 + br i1 %39, label %249, label %250, !dbg !56 + +249: ; preds = %41 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !56 + br label %250, !dbg !56 + +250: ; preds = %249, %41 + %251 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %47, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !57 + %252 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58 + %253 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58 + %254 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58 + %255 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58 + %256 = fadd float %252, 0x3EE4F8B580000000, !dbg !59 + %257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60 + %.not.i = icmp eq i32 %257, 0, !dbg !60 + br i1 %.not.i, label %260, label %258, !dbg !60 + +258: ; preds = %250 + %259 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %256), !dbg !60 + br label %__nv_rsqrtf.exit, !dbg !60 + +260: ; preds = %250 + %261 = tail call float @llvm.nvvm.rsqrt.approx.f(float %256), !dbg !60 + br label %__nv_rsqrtf.exit, !dbg !60 + +__nv_rsqrtf.exit: ; preds = %258, %260 + %.0.i = phi float [ %259, %258 ], [ %261, %260 ], !dbg !60 + %262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60 + %263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60 + %264 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60 + %265 = extractvalue { i32, i32, i32, i32 } %251, 3, !dbg !57 + %266 = bitcast i32 %265 to float, !dbg !57 + %267 = extractvalue { i32, i32, i32, i32 } %246, 3, !dbg !53 + %268 = bitcast i32 %267 to float, !dbg !53 + %269 = fadd float %268, %266, !dbg !61 + %270 = fsub float %269, %244, !dbg !62 + %271 = extractvalue { i32, i32, i32, i32 } %251, 2, !dbg !57 + %272 = bitcast i32 %271 to float, !dbg !57 + %273 = extractvalue { i32, i32, i32, i32 } %246, 2, !dbg !53 + %274 = bitcast i32 %273 to float, !dbg !53 + %275 = fadd float %274, %272, !dbg !61 + %276 = fsub float %275, %244, !dbg !62 + %277 = extractvalue { i32, i32, i32, i32 } %251, 1, !dbg !57 + %278 = bitcast i32 %277 to float, !dbg !57 + %279 = extractvalue { i32, i32, i32, i32 } %246, 1, !dbg !53 + %280 = bitcast i32 %279 to float, !dbg !53 + %281 = fadd float %280, %278, !dbg !61 + %282 = fsub float %281, %244, !dbg !62 + %283 = extractvalue { i32, i32, i32, i32 } %251, 0, !dbg !57 + %284 = bitcast i32 %283 to float, !dbg !57 + %285 = extractvalue { i32, i32, i32, i32 } %246, 0, !dbg !53 + %286 = bitcast i32 %285 to float, !dbg !53 + %287 = fadd float %286, %284, !dbg !61 + %288 = fsub float %287, %244, !dbg !62 + %289 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !55 + %290 = bitcast i32 %289 to float, !dbg !55 + %291 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !55 + %292 = bitcast i32 %291 to float, !dbg !55 + %293 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !55 + %294 = bitcast i32 %293 to float, !dbg !55 + %295 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !55 + %296 = bitcast i32 %295 to float, !dbg !55 + %297 = fmul float %288, %.0.i, !dbg !63 + %298 = fmul float %282, %.0.i, !dbg !63 + %299 = fmul float %276, %.0.i, !dbg !63 + %300 = fmul float %270, %.0.i, !dbg !63 + %301 = fmul float %297, %290, !dbg !64 + %302 = fmul float %298, %292, !dbg !64 + %303 = fmul float %299, %294, !dbg !64 + %304 = fmul float %300, %296, !dbg !64 + %305 = shl i32 %13, 8, !dbg !65 + %306 = or i32 %305, %12, !dbg !66 + %307 = sext i32 %306 to i64, !dbg !67 + %308 = getelementptr i16, ptr addrspace(1) %4, i64 %307, !dbg !67 + %309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %301) #6, !dbg !68 + %310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %302) #6, !dbg !68 + %311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %303) #6, !dbg !68 + %312 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %304) #6, !dbg !68 + %313 = insertelement <2 x i16> undef, i16 %309, i64 0, !dbg !68 + %314 = insertelement <2 x i16> %313, i16 %310, i64 1, !dbg !68 + %315 = bitcast <2 x i16> %314 to i32, !dbg !68 + %316 = insertelement <2 x i16> undef, i16 %311, i64 0, !dbg !68 + %317 = insertelement <2 x i16> %316, i16 %312, i64 1, !dbg !68 + %318 = bitcast <2 x i16> %317 to i32, !dbg !68 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %315, i32 %318, ptr addrspace(1) %308, i1 true) #6, !dbg !68 + ret void, !dbg !69 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 24, column: 33, scope: !7) +!11 = !DILocation(line: 21, column: 28, scope: !7) +!12 = !DILocation(line: 26, column: 30, scope: !7) +!13 = !DILocation(line: 26, column: 35, scope: !7) +!14 = !DILocation(line: 27, column: 18, scope: !7) +!15 = !DILocation(line: 35, column: 44, scope: !7) +!16 = !DILocation(line: 35, column: 40, scope: !7) +!17 = !DILocation(line: 35, column: 34, scope: !7) +!18 = !DILocation(line: 35, column: 50, scope: !7) +!19 = !DILocation(line: 36, column: 22, scope: !7) +!20 = !DILocation(line: 37, column: 22, scope: !7) +!21 = !DILocation(line: 38, column: 36, scope: !7) +!22 = !DILocation(line: 39, column: 40, scope: !7) +!23 = !DILocation(line: 39, column: 55, scope: !7) +!24 = !DILocation(line: 40, column: 44, scope: !7) +!25 = !DILocation(line: 40, column: 40, scope: !7) +!26 = !DILocation(line: 40, column: 34, scope: !7) +!27 = !DILocation(line: 40, column: 52, scope: !7) +!28 = !DILocation(line: 41, column: 22, scope: !7) +!29 = !DILocation(line: 98, column: 22, scope: !30, inlinedAt: !32) +!30 = distinct !DILexicalBlockFile(scope: !7, file: !31, discriminator: 0) +!31 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!32 = !DILocation(line: 44, column: 38, scope: !30) +!33 = !DILocation(line: 101, column: 30, scope: !30, inlinedAt: !32) +!34 = !DILocation(line: 101, column: 22, scope: !30, inlinedAt: !32) +!35 = !DILocation(line: 101, column: 13, scope: !30, inlinedAt: !32) +!36 = !DILocation(line: 108, column: 21, scope: !37, inlinedAt: !38) +!37 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0) +!38 = !DILocation(line: 120, column: 46, scope: !37, inlinedAt: !39) +!39 = !DILocation(line: 50, column: 41, scope: !37) +!40 = !DILocation(line: 110, column: 60, scope: !37, inlinedAt: !38) +!41 = !DILocation(line: 112, column: 25, scope: !37, inlinedAt: !38) +!42 = !DILocation(line: 112, column: 17, scope: !37, inlinedAt: !38) +!43 = !DILocation(line: 113, column: 15, scope: !37, inlinedAt: !38) +!44 = !DILocation(line: 113, column: 30, scope: !37, inlinedAt: !38) +!45 = !DILocation(line: 113, column: 49, scope: !37, inlinedAt: !38) +!46 = !DILocation(line: 113, column: 22, scope: !37, inlinedAt: !38) +!47 = !DILocation(line: 113, column: 38, scope: !37, inlinedAt: !38) +!48 = !DILocation(line: 120, column: 46, scope: !30, inlinedAt: !49) +!49 = !DILocation(line: 50, column: 41, scope: !30) +!50 = !DILocation(line: 109, column: 28, scope: !37, inlinedAt: !38) +!51 = !DILocation(line: 110, column: 39, scope: !37, inlinedAt: !38) +!52 = !DILocation(line: 110, column: 49, scope: !37, inlinedAt: !38) +!53 = !DILocation(line: 59, column: 51, scope: !7) +!54 = !DILocation(line: 60, column: 35, scope: !7) +!55 = !DILocation(line: 60, column: 40, scope: !7) +!56 = !DILocation(line: 64, column: 57, scope: !7) +!57 = !DILocation(line: 65, column: 54, scope: !7) +!58 = !DILocation(line: 69, column: 23, scope: !7) +!59 = !DILocation(line: 71, column: 24, scope: !7) +!60 = !DILocation(line: 72, column: 30, scope: !7) +!61 = !DILocation(line: 66, column: 24, scope: !7) +!62 = !DILocation(line: 67, column: 24, scope: !7) +!63 = !DILocation(line: 73, column: 24, scope: !7) +!64 = !DILocation(line: 74, column: 24, scope: !7) +!65 = !DILocation(line: 76, column: 39, scope: !7) +!66 = !DILocation(line: 76, column: 35, scope: !7) +!67 = !DILocation(line: 76, column: 29, scope: !7) +!68 = !DILocation(line: 76, column: 52, scope: !7) +!69 = !DILocation(line: 55, column: 4, scope: !7) diff --git a/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttgir b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5618dc9f84257d09aeab684a881fd40e2a3d7a37 --- /dev/null +++ b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttgir @@ -0,0 +1,81 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked> + %cst_0 = arith.constant dense<-1> : tensor<1x1xi64, #blocked> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32, #blocked> + %cst_2 = arith.constant dense<50257> : tensor<1x1024xi64, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c1024_i32 = arith.constant 1024 : i32 + %c50257_i32 = arith.constant 50257 : i32 + %c50257_i64 = arith.constant 50257 : i64 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x1024xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<1024xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x1024xi32, #blocked> + %4 = arith.extsi %3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> + %5 = tt.addptr %arg1, %1 : !tt.ptr, i64 + %6 = tt.splat %5 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked> + %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr, i32 + %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr, i32 + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %12 = arith.muli %1, %c50257_i64 : i64 + %13 = tt.splat %12 : (i64) -> tensor<1x1024xi64, #blocked> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + %15 = arith.cmpi ne, %7, %cst_0 : tensor<1x1xi64, #blocked> + %16 = arith.divf %9, %11 : f32 + %17 = tt.splat %16 : (f32) -> tensor<1x1xf32, #blocked> + %18 = arith.select %15, %17, %cst : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked> + %19 = tt.broadcast %18 : (tensor<1x1xf32, #blocked>) -> tensor<1x1024xf32, #blocked> + %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c1024_i32 iter_args(%arg10 = %cst_1) -> (tensor<1x1024xf32, #blocked>) : i32 { + %27 = arith.extsi %arg9 : i32 to i64 + %28 = tt.splat %27 : (i64) -> tensor<1x1024xi64, #blocked> + %29 = arith.addi %28, %4 : tensor<1x1024xi64, #blocked> + %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x1024xi64, #blocked> + %31 = arith.addi %29, %13 : tensor<1x1024xi64, #blocked> + %32 = tt.addptr %14, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %33 = tt.load %32, %30, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1024xf32, #blocked> + %34 = arith.mulf %33, %19 : tensor<1x1024xf32, #blocked> + %35 = arith.addf %arg10, %34 : tensor<1x1024xf32, #blocked> + %36 = arith.select %30, %35, %arg10 : tensor<1x1024xi1, #blocked>, tensor<1x1024xf32, #blocked> + scf.yield %36 : tensor<1x1024xf32, #blocked> + } + %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %27 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %27 : f32 + }) : (tensor<1x1024xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> + %23 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + %24 = tt.splat %arg5 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + %25 = tt.broadcast %22 : (tensor<1x1xf32, #blocked>) -> tensor<1x1024xf32, #blocked> + %26 = tt.splat %arg6 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + scf.for %arg9 = %c0_i32 to %c50257_i32 step %c1024_i32 : i32 { + %27 = arith.extsi %arg9 : i32 to i64 + %28 = tt.splat %27 : (i64) -> tensor<1x1024xi64, #blocked> + %29 = arith.addi %28, %4 : tensor<1x1024xi64, #blocked> + %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x1024xi64, #blocked> + %31 = arith.addi %29, %13 : tensor<1x1024xi64, #blocked> + %32 = tt.addptr %23, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %33 = tt.load %32, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xbf16, #blocked> + %34 = arith.extf %33 : tensor<1x1024xbf16, #blocked> to tensor<1x1024xf32, #blocked> + %35 = tt.addptr %14, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %36 = tt.load %35, %30, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xf32, #blocked> + %37 = tt.addptr %24, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %38 = tt.load %37, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xbf16, #blocked> + %39 = arith.extf %38 : tensor<1x1024xbf16, #blocked> to tensor<1x1024xf32, #blocked> + %40 = arith.mulf %36, %19 : tensor<1x1024xf32, #blocked> + %41 = math.exp %39 : tensor<1x1024xf32, #blocked> + %42 = arith.mulf %41, %25 : tensor<1x1024xf32, #blocked> + %43 = arith.subf %40, %42 : tensor<1x1024xf32, #blocked> + %44 = arith.addf %34, %43 : tensor<1x1024xf32, #blocked> + %45 = tt.addptr %26, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %46 = arith.truncf %44 : tensor<1x1024xf32, #blocked> to tensor<1x1024xbf16, #blocked> + tt.store %45, %46, %30 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1024xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.cubin b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0a00d68188ec2c9dc713ede46d6fbe61f98e5262 Binary files /dev/null and b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.cubin differ diff --git a/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.llir b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..eb8c8d0efdc5ba12954368afe2549303d5234920 --- /dev/null +++ b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.llir @@ -0,0 +1,162 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] + +define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %5 = and i32 %4, 127, !dbg !8 + %6 = shl nuw nsw i32 %5, 3, !dbg !8 + %7 = shl nuw nsw i32 %5, 2, !dbg !8 + %8 = or i32 %7, 512, !dbg !8 + %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9 + %10 = shl i32 %9, 10, !dbg !10 + %11 = or i32 %10, %6, !dbg !11 + %12 = or i32 %10, %7, !dbg !11 + %13 = or i32 %10, %8, !dbg !11 + %14 = sext i32 %11 to i64, !dbg !12 + %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12 + %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13 + %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13 + %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13 + %21 = trunc i32 %17 to i16, !dbg !13 + %extelt.offset = lshr i32 %17, 16, !dbg !13 + %22 = trunc i32 %extelt.offset to i16, !dbg !13 + %23 = trunc i32 %18 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %18, 16, !dbg !13 + %24 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %25 = trunc i32 %19 to i16, !dbg !13 + %extelt.offset2 = lshr i32 %19, 16, !dbg !13 + %26 = trunc i32 %extelt.offset2 to i16, !dbg !13 + %27 = trunc i32 %20 to i16, !dbg !13 + %extelt.offset3 = lshr i32 %20, 16, !dbg !13 + %28 = trunc i32 %extelt.offset3 to i16, !dbg !13 + %29 = zext nneg i32 %6 to i64, !dbg !14 + %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14 + %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14 + store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14 + %32 = or i32 %6, 1, !dbg !14 + %33 = zext nneg i32 %32 to i64, !dbg !14 + %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14 + %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14 + store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14 + %36 = or i32 %6, 2, !dbg !14 + %37 = zext nneg i32 %36 to i64, !dbg !14 + %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14 + %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14 + store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14 + %40 = or i32 %6, 3, !dbg !14 + %41 = zext nneg i32 %40 to i64, !dbg !14 + %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14 + %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14 + store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14 + %44 = or i32 %6, 4, !dbg !14 + %45 = zext nneg i32 %44 to i64, !dbg !14 + %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14 + %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14 + store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14 + %48 = or i32 %6, 5, !dbg !14 + %49 = zext nneg i32 %48 to i64, !dbg !14 + %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14 + %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14 + store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14 + %52 = or i32 %6, 6, !dbg !14 + %53 = zext nneg i32 %52 to i64, !dbg !14 + %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14 + %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14 + store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14 + %56 = or i32 %6, 7, !dbg !14 + %57 = zext nneg i32 %56 to i64, !dbg !14 + %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14 + %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14 + store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14 + tail call void @llvm.nvvm.barrier0(), !dbg !14 + %60 = zext nneg i32 %7 to i64, !dbg !14 + %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14 + %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14 + %63 = or i32 %7, 1, !dbg !14 + %64 = zext nneg i32 %63 to i64, !dbg !14 + %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14 + %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14 + %67 = or i32 %7, 2, !dbg !14 + %68 = zext nneg i32 %67 to i64, !dbg !14 + %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14 + %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14 + %71 = or i32 %7, 3, !dbg !14 + %72 = zext nneg i32 %71 to i64, !dbg !14 + %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14 + %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14 + %75 = zext nneg i32 %8 to i64, !dbg !14 + %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14 + %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14 + %78 = or i32 %7, 513, !dbg !14 + %79 = zext nneg i32 %78 to i64, !dbg !14 + %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14 + %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14 + %82 = or i32 %7, 514, !dbg !14 + %83 = zext nneg i32 %82 to i64, !dbg !14 + %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14 + %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14 + %86 = or i32 %7, 515, !dbg !14 + %87 = zext nneg i32 %86 to i64, !dbg !14 + %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14 + %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14 + %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14 + %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14 + %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14 + %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14 + %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14 + %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14 + %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14 + %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14 + %98 = sext i32 %12 to i64, !dbg !15 + %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15 + %100 = sext i32 %13 to i64, !dbg !15 + %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15 + %102 = bitcast float %90 to i32, !dbg !16 + %103 = bitcast float %91 to i32, !dbg !16 + %104 = bitcast float %92 to i32, !dbg !16 + %105 = bitcast float %93 to i32, !dbg !16 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16 + %106 = bitcast float %94 to i32, !dbg !16 + %107 = bitcast float %95 to i32, !dbg !16 + %108 = bitcast float %96 to i32, !dbg !16 + %109 = bitcast float %97 to i32, !dbg !16 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16 + ret void, !dbg !17 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cyamhdbxtmf4rgres6uo7orhfzw3ryhsvm5qzdvyqgggck2hqbyi.py", directory: "/tmp/torchinductor_root/ya") +!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 24, column: 30, scope: !5) +!13 = !DILocation(line: 24, column: 35, scope: !5) +!14 = !DILocation(line: 24, column: 44, scope: !5) +!15 = !DILocation(line: 26, column: 25, scope: !5) +!16 = !DILocation(line: 26, column: 36, scope: !5) +!17 = !DILocation(line: 26, column: 4, scope: !5) diff --git a/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ptx b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1db8301fffd68efac6af39f4b7932c64d6cafe66 --- /dev/null +++ b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ptx @@ -0,0 +1,338 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<37>; + .reg .b64 %rd<13>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd4, [triton__0d1d2de_param_0]; + ld.param.u64 %rd5, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r22, %tid.x; + and.b32 %r23, %r22, 127; + shl.b32 %r24, %r23, 3; + shl.b32 %r25, %r23, 2; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r26, %r1, 10; + .loc 1 21 23 + or.b32 %r27, %r26, %r24; + or.b32 %r28, %r26, %r25; + .loc 1 24 30 + mul.wide.s32 %rd6, %r27, 2; + add.s64 %rd1, %rd4, %rd6; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + shr.u32 %r29, %r2, 16; + shr.u32 %r30, %r3, 16; + shr.u32 %r31, %r4, 16; + shr.u32 %r32, %r5, 16; + .loc 1 24 44 + shl.b32 %r33, %r23, 4; + mov.u32 %r34, global_smem; + add.s32 %r35, %r34, %r33; + st.shared.u16 [%r35], %r2; + st.shared.u16 [%r35+2], %r29; + st.shared.u16 [%r35+4], %r3; + st.shared.u16 [%r35+6], %r30; + st.shared.u16 [%r35+8], %r4; + st.shared.u16 [%r35+10], %r31; + st.shared.u16 [%r35+12], %r5; + st.shared.u16 [%r35+14], %r32; + bar.sync 0; + add.s32 %r36, %r34, %r24; + ld.shared.u16 %rs1, [%r36]; + ld.shared.u16 %rs2, [%r36+2]; + ld.shared.u16 %rs3, [%r36+4]; + ld.shared.u16 %rs4, [%r36+6]; + ld.shared.u16 %rs5, [%r36+1024]; + ld.shared.u16 %rs6, [%r36+1026]; + ld.shared.u16 %rs7, [%r36+1028]; + ld.shared.u16 %rs8, [%r36+1030]; + cvt.f32.bf16 %r14, %rs1; + cvt.f32.bf16 %r15, %rs2; + cvt.f32.bf16 %r16, %rs3; + cvt.f32.bf16 %r17, %rs4; + cvt.f32.bf16 %r18, %rs5; + cvt.f32.bf16 %r19, %rs6; + cvt.f32.bf16 %r20, %rs7; + cvt.f32.bf16 %r21, %rs8; + .loc 1 26 25 + mul.wide.s32 %rd7, %r28, 4; + add.s64 %rd2, %rd5, %rd7; + cvt.s64.s32 %rd8, %r26; + cvt.u64.u32 %rd9, %r25; + or.b64 %rd10, %rd8, %rd9; + shl.b64 %rd11, %rd10, 2; + add.s64 %rd12, %rd5, %rd11; + add.s64 %rd3, %rd12, 2048; + .loc 1 26 36 + @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 }; + @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/ya/cyamhdbxtmf4rgres6uo7orhfzw3ryhsvm5qzdvyqgggck2hqbyi.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 121 +.b8 97 +.b8 109 +.b8 104 +.b8 100 +.b8 98 +.b8 120 +.b8 116 +.b8 109 +.b8 102 +.b8 52 +.b8 114 +.b8 103 +.b8 114 +.b8 101 +.b8 115 +.b8 54 +.b8 117 +.b8 111 +.b8 55 +.b8 111 +.b8 114 +.b8 104 +.b8 102 +.b8 122 +.b8 119 +.b8 51 +.b8 114 +.b8 121 +.b8 104 +.b8 115 +.b8 118 +.b8 109 +.b8 53 +.b8 113 +.b8 122 +.b8 100 +.b8 118 +.b8 121 +.b8 113 +.b8 103 +.b8 103 +.b8 103 +.b8 99 +.b8 107 +.b8 50 +.b8 104 +.b8 113 +.b8 98 +.b8 121 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 121 +.b8 97 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7b1915ef6f4b04b11f0f14daa6df8baf1f463140 Binary files /dev/null and b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin differ diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.cubin b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a8870742cf1a74365e29e13a76184f39f392569e Binary files /dev/null and b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.cubin differ diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttgir b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a637ba49a95253d43106bd4af13cb18dde546f08 --- /dev/null +++ b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttgir @@ -0,0 +1,89 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant dense<-1> : tensor<1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked> + %cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked> + %cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked> + %cst_5 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %22 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %24 = tt.load %23, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %25 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %26 = tt.splat %25 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %27 = tt.load %26 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked> + %28 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %31 = arith.addf %30, %cst_5 : f32 + %32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %33 = arith.subf %15, %32 : tensor<256xf32, #blocked> + %34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %35 = arith.mulf %33, %34 : tensor<256xf32, #blocked> + %36 = arith.mulf %28, %35 : tensor<256xf32, #blocked> + %37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %39 = arith.addf %38, %cst_5 : f32 + %40 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked> + %41 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked> + %42 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked> + %43 = arith.subf %41, %42 : tensor<256xf32, #blocked> + %44 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked> + %45 = arith.mulf %35, %44 : tensor<256xf32, #blocked> + %46 = arith.subf %43, %45 : tensor<256xf32, #blocked> + %47 = tt.broadcast %40 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %48 = arith.mulf %47, %46 : tensor<256xf32, #blocked> + %49 = arith.addf %24, %48 : tensor<256xf32, #blocked> + %50 = arith.addi %27, %cst_4 : tensor<1xi64, #blocked> + %51 = arith.cmpi slt, %27, %cst_3 : tensor<1xi64, #blocked> + %52 = arith.select %51, %50, %27 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked> + %53 = arith.cmpi eq, %27, %cst_1 : tensor<1xi64, #blocked> + %54 = tt.broadcast %53 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked> + %55 = arith.select %54, %cst_6, %49 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + tt.store %23, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %56 = arith.muli %52, %cst_2 : tensor<1xi64, #blocked> + %57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked> + %58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> + %59 = arith.addi %58, %57 : tensor<256xi64, #blocked> + %60 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %62 = "tt.atomic_rmw"(%61, %55, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.llir b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..66db9c191fb1d24bcf079ba9cb444fd0f758172e --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.llir @@ -0,0 +1,610 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, ptr addrspace(1) %18, ptr addrspace(1) %19, ptr addrspace(1) %20, ptr addrspace(1) %21, ptr addrspace(1) %22, ptr addrspace(1) %23, ptr addrspace(1) %24, ptr addrspace(1) %25, ptr addrspace(1) %26, ptr addrspace(1) %27, ptr addrspace(1) %28, i32 %29, i32 %30) local_unnamed_addr !dbg !5 { + %32 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %33 = and i32 %32, 31, !dbg !8 + %34 = lshr i32 %32, 5, !dbg !8 + %35 = and i32 %34, 1, !dbg !8 + %urem = shl i32 %32, 2, !dbg !8 + %36 = and i32 %urem, 252, !dbg !8 + %37 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %38 = shl i32 %37, 8, !dbg !10 + %39 = or i32 %38, %36, !dbg !11 + %40 = sext i32 %39 to i64, !dbg !12 + %41 = getelementptr float, ptr addrspace(1) %0, i64 %40, !dbg !12 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !13 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !13 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !13 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !13 + %47 = bitcast i32 %43 to float, !dbg !13 + %48 = bitcast i32 %44 to float, !dbg !13 + %49 = bitcast i32 %45 to float, !dbg !13 + %50 = bitcast i32 %46 to float, !dbg !13 + %51 = getelementptr i16, ptr addrspace(1) %1, i64 %40, !dbg !14 + %52 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !15 + %53 = extractvalue { i32, i32 } %52, 0, !dbg !15 + %54 = extractvalue { i32, i32 } %52, 1, !dbg !15 + %55 = trunc i32 %53 to i16, !dbg !15 + %extelt.offset = lshr i32 %53, 16, !dbg !15 + %56 = trunc i32 %extelt.offset to i16, !dbg !15 + %57 = trunc i32 %54 to i16, !dbg !15 + %extelt.offset1 = lshr i32 %54, 16, !dbg !15 + %58 = trunc i32 %extelt.offset1 to i16, !dbg !15 + %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #3, !dbg !16 + %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #3, !dbg !16 + %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !16 + %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #3, !dbg !16 + %63 = getelementptr i16, ptr addrspace(1) %2, i64 %40, !dbg !17 + %64 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %65 = extractvalue { i32, i32 } %64, 0, !dbg !18 + %66 = extractvalue { i32, i32 } %64, 1, !dbg !18 + %67 = trunc i32 %65 to i16, !dbg !18 + %extelt.offset2 = lshr i32 %65, 16, !dbg !18 + %68 = trunc i32 %extelt.offset2 to i16, !dbg !18 + %69 = trunc i32 %66 to i16, !dbg !18 + %extelt.offset3 = lshr i32 %66, 16, !dbg !18 + %70 = trunc i32 %extelt.offset3 to i16, !dbg !18 + %71 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %67) #3, !dbg !19 + %72 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %68) #3, !dbg !19 + %73 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #3, !dbg !19 + %74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #3, !dbg !19 + %75 = sext i32 %37 to i64, !dbg !20 + %76 = getelementptr float, ptr addrspace(1) %3, i64 %75, !dbg !20 + %77 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %78 = bitcast i32 %77 to float, !dbg !21 + %79 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %80 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %81 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %82 = getelementptr float, ptr addrspace(1) %4, i64 %75, !dbg !22 + %83 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %84 = bitcast i32 %83 to float, !dbg !23 + %85 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %86 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %87 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %88 = getelementptr i16, ptr addrspace(1) %5, i64 %40, !dbg !24 + %89 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %88, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !25 + %90 = extractvalue { i32, i32 } %89, 0, !dbg !25 + %91 = extractvalue { i32, i32 } %89, 1, !dbg !25 + %92 = trunc i32 %90 to i16, !dbg !25 + %extelt.offset4 = lshr i32 %90, 16, !dbg !25 + %93 = trunc i32 %extelt.offset4 to i16, !dbg !25 + %94 = trunc i32 %91 to i16, !dbg !25 + %extelt.offset5 = lshr i32 %91, 16, !dbg !25 + %95 = trunc i32 %extelt.offset5 to i16, !dbg !25 + %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %92) #3, !dbg !26 + %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %93) #3, !dbg !26 + %98 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %94) #3, !dbg !26 + %99 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %95) #3, !dbg !26 + %100 = getelementptr float, ptr addrspace(1) %6, i64 %75, !dbg !27 + %101 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %102 = bitcast i32 %101 to float, !dbg !28 + %103 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %104 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %105 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %106 = getelementptr float, ptr addrspace(1) %7, i64 %75, !dbg !29 + %107 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %108 = bitcast i32 %107 to float, !dbg !30 + %109 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %110 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %111 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %112 = getelementptr i16, ptr addrspace(1) %8, i64 %40, !dbg !31 + %113 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %112, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32 + %114 = extractvalue { i32, i32 } %113, 0, !dbg !32 + %115 = extractvalue { i32, i32 } %113, 1, !dbg !32 + %116 = trunc i32 %114 to i16, !dbg !32 + %extelt.offset6 = lshr i32 %114, 16, !dbg !32 + %117 = trunc i32 %extelt.offset6 to i16, !dbg !32 + %118 = trunc i32 %115 to i16, !dbg !32 + %extelt.offset7 = lshr i32 %115, 16, !dbg !32 + %119 = trunc i32 %extelt.offset7 to i16, !dbg !32 + %120 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #3, !dbg !33 + %121 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %117) #3, !dbg !33 + %122 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %118) #3, !dbg !33 + %123 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %119) #3, !dbg !33 + %124 = getelementptr i16, ptr addrspace(1) %9, i64 %40, !dbg !34 + %125 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %124, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !35 + %126 = extractvalue { i32, i32 } %125, 0, !dbg !35 + %127 = extractvalue { i32, i32 } %125, 1, !dbg !35 + %128 = trunc i32 %126 to i16, !dbg !35 + %extelt.offset8 = lshr i32 %126, 16, !dbg !35 + %129 = trunc i32 %extelt.offset8 to i16, !dbg !35 + %130 = trunc i32 %127 to i16, !dbg !35 + %extelt.offset9 = lshr i32 %127, 16, !dbg !35 + %131 = trunc i32 %extelt.offset9 to i16, !dbg !35 + %132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #3, !dbg !36 + %133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #3, !dbg !36 + %134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %130) #3, !dbg !36 + %135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %131) #3, !dbg !36 + %136 = getelementptr i16, ptr addrspace(1) %10, i64 %40, !dbg !37 + %137 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %136, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !38 + %138 = extractvalue { i32, i32 } %137, 0, !dbg !38 + %139 = extractvalue { i32, i32 } %137, 1, !dbg !38 + %140 = trunc i32 %138 to i16, !dbg !38 + %extelt.offset10 = lshr i32 %138, 16, !dbg !38 + %141 = trunc i32 %extelt.offset10 to i16, !dbg !38 + %142 = trunc i32 %139 to i16, !dbg !38 + %extelt.offset11 = lshr i32 %139, 16, !dbg !38 + %143 = trunc i32 %extelt.offset11 to i16, !dbg !38 + %144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %140) #3, !dbg !39 + %145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %141) #3, !dbg !39 + %146 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %142) #3, !dbg !39 + %147 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %143) #3, !dbg !39 + %148 = getelementptr float, ptr addrspace(1) %11, i64 %75, !dbg !40 + %149 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %150 = bitcast i32 %149 to float, !dbg !41 + %151 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %152 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %153 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %154 = getelementptr float, ptr addrspace(1) %12, i64 %75, !dbg !42 + %155 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %156 = bitcast i32 %155 to float, !dbg !43 + %157 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %158 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %159 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %160 = getelementptr i16, ptr addrspace(1) %13, i64 %40, !dbg !44 + %161 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %160, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !45 + %162 = extractvalue { i32, i32 } %161, 0, !dbg !45 + %163 = extractvalue { i32, i32 } %161, 1, !dbg !45 + %164 = trunc i32 %162 to i16, !dbg !45 + %extelt.offset12 = lshr i32 %162, 16, !dbg !45 + %165 = trunc i32 %extelt.offset12 to i16, !dbg !45 + %166 = trunc i32 %163 to i16, !dbg !45 + %extelt.offset13 = lshr i32 %163, 16, !dbg !45 + %167 = trunc i32 %extelt.offset13 to i16, !dbg !45 + %168 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %164) #3, !dbg !46 + %169 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %165) #3, !dbg !46 + %170 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %166) #3, !dbg !46 + %171 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %167) #3, !dbg !46 + %172 = getelementptr float, ptr addrspace(1) %14, i64 %75, !dbg !47 + %173 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %174 = bitcast i32 %173 to float, !dbg !48 + %175 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %176 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %177 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %178 = getelementptr float, ptr addrspace(1) %15, i64 %75, !dbg !49 + %179 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %180 = bitcast i32 %179 to float, !dbg !50 + %181 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %182 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %183 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %184 = getelementptr i16, ptr addrspace(1) %16, i64 %40, !dbg !51 + %185 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !52 + %186 = extractvalue { i32, i32 } %185, 0, !dbg !52 + %187 = extractvalue { i32, i32 } %185, 1, !dbg !52 + %188 = trunc i32 %186 to i16, !dbg !52 + %extelt.offset14 = lshr i32 %186, 16, !dbg !52 + %189 = trunc i32 %extelt.offset14 to i16, !dbg !52 + %190 = trunc i32 %187 to i16, !dbg !52 + %extelt.offset15 = lshr i32 %187, 16, !dbg !52 + %191 = trunc i32 %extelt.offset15 to i16, !dbg !52 + %192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !53 + %193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !53 + %194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !53 + %195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !53 + %196 = getelementptr float, ptr addrspace(1) %17, i64 %75, !dbg !54 + %197 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %198 = bitcast i32 %197 to float, !dbg !55 + %199 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %200 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %201 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %202 = getelementptr float, ptr addrspace(1) %18, i64 %75, !dbg !56 + %203 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %204 = bitcast i32 %203 to float, !dbg !57 + %205 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %206 = bitcast i32 %205 to float, !dbg !57 + %207 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %208 = bitcast i32 %207 to float, !dbg !57 + %209 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %210 = bitcast i32 %209 to float, !dbg !57 + %211 = getelementptr float, ptr addrspace(1) %19, i64 %40, !dbg !58 + %212 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %211, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !59 + %213 = extractvalue { i32, i32, i32, i32 } %212, 0, !dbg !59 + %214 = extractvalue { i32, i32, i32, i32 } %212, 1, !dbg !59 + %215 = extractvalue { i32, i32, i32, i32 } %212, 2, !dbg !59 + %216 = extractvalue { i32, i32, i32, i32 } %212, 3, !dbg !59 + %217 = zext nneg i32 %36 to i64, !dbg !60 + %218 = getelementptr float, ptr addrspace(1) %20, i64 %217, !dbg !60 + %219 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %218, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !61 + %220 = extractvalue { i32, i32, i32, i32 } %219, 0, !dbg !61 + %221 = extractvalue { i32, i32, i32, i32 } %219, 1, !dbg !61 + %222 = extractvalue { i32, i32, i32, i32 } %219, 2, !dbg !61 + %223 = extractvalue { i32, i32, i32, i32 } %219, 3, !dbg !61 + %224 = fadd float %59, %47, !dbg !62 + %225 = fadd float %60, %48, !dbg !62 + %226 = fadd float %61, %49, !dbg !62 + %227 = fadd float %62, %50, !dbg !62 + %228 = fadd float %224, %71, !dbg !63 + %229 = fadd float %225, %72, !dbg !63 + %230 = fadd float %226, %73, !dbg !63 + %231 = fadd float %227, %74, !dbg !63 + %232 = fsub float %228, %78, !dbg !64 + %233 = fsub float %229, %78, !dbg !64 + %234 = fsub float %230, %78, !dbg !64 + %235 = fsub float %231, %78, !dbg !64 + %236 = fmul float %232, %84, !dbg !65 + %237 = fmul float %233, %84, !dbg !65 + %238 = fmul float %234, %84, !dbg !65 + %239 = fmul float %235, %84, !dbg !65 + %240 = fadd float %228, %96, !dbg !66 + %241 = fadd float %229, %97, !dbg !66 + %242 = fadd float %230, %98, !dbg !66 + %243 = fadd float %231, %99, !dbg !66 + %244 = fsub float %240, %102, !dbg !67 + %245 = fsub float %241, %102, !dbg !67 + %246 = fsub float %242, %102, !dbg !67 + %247 = fsub float %243, %102, !dbg !67 + %248 = fmul float %244, %108, !dbg !68 + %249 = fmul float %245, %108, !dbg !68 + %250 = fmul float %246, %108, !dbg !68 + %251 = fmul float %247, %108, !dbg !68 + %252 = fadd float %240, %120, !dbg !69 + %253 = fadd float %241, %121, !dbg !69 + %254 = fadd float %242, %122, !dbg !69 + %255 = fadd float %243, %123, !dbg !69 + %256 = fadd float %252, %132, !dbg !70 + %257 = fadd float %253, %133, !dbg !70 + %258 = fadd float %254, %134, !dbg !70 + %259 = fadd float %255, %135, !dbg !70 + %260 = fadd float %256, %144, !dbg !71 + %261 = fadd float %257, %145, !dbg !71 + %262 = fadd float %258, %146, !dbg !71 + %263 = fadd float %259, %147, !dbg !71 + %264 = fsub float %260, %150, !dbg !72 + %265 = fsub float %261, %150, !dbg !72 + %266 = fsub float %262, %150, !dbg !72 + %267 = fsub float %263, %150, !dbg !72 + %268 = fmul float %264, %156, !dbg !73 + %269 = fmul float %265, %156, !dbg !73 + %270 = fmul float %266, %156, !dbg !73 + %271 = fmul float %267, %156, !dbg !73 + %272 = fadd float %260, %168, !dbg !74 + %273 = fadd float %261, %169, !dbg !74 + %274 = fadd float %262, %170, !dbg !74 + %275 = fadd float %263, %171, !dbg !74 + %276 = fsub float %272, %174, !dbg !75 + %277 = fsub float %273, %174, !dbg !75 + %278 = fsub float %274, %174, !dbg !75 + %279 = fsub float %275, %174, !dbg !75 + %280 = fmul float %276, %180, !dbg !76 + %281 = fmul float %277, %180, !dbg !76 + %282 = fmul float %278, %180, !dbg !76 + %283 = fmul float %279, %180, !dbg !76 + %284 = fadd float %272, %192, !dbg !77 + %285 = fadd float %273, %193, !dbg !77 + %286 = fadd float %274, %194, !dbg !77 + %287 = fadd float %275, %195, !dbg !77 + %288 = fsub float %284, %198, !dbg !78 + %289 = fsub float %285, %198, !dbg !78 + %290 = fsub float %286, %198, !dbg !78 + %291 = fsub float %287, %198, !dbg !78 + %292 = fmul float %288, %204, !dbg !79 + %293 = fmul float %289, %204, !dbg !79 + %294 = fmul float %290, %204, !dbg !79 + %295 = fmul float %291, %204, !dbg !79 + %296 = insertelement <2 x i32> poison, i32 %213, i64 0, !dbg !59 + %297 = insertelement <2 x i32> %296, i32 %214, i64 1, !dbg !59 + %298 = bitcast <2 x i32> %297 to <2 x float>, !dbg !59 + %299 = insertelement <2 x i32> poison, i32 %220, i64 0, !dbg !61 + %300 = insertelement <2 x i32> %299, i32 %221, i64 1, !dbg !61 + %301 = bitcast <2 x i32> %300 to <2 x float>, !dbg !61 + %302 = fmul <2 x float> %298, %301, !dbg !80 + %303 = insertelement <2 x i32> poison, i32 %216, i64 0, !dbg !59 + %304 = insertelement <2 x i32> %303, i32 %215, i64 1, !dbg !59 + %305 = bitcast <2 x i32> %304 to <2 x float>, !dbg !59 + %306 = insertelement <2 x i32> poison, i32 %223, i64 0, !dbg !61 + %307 = insertelement <2 x i32> %306, i32 %222, i64 1, !dbg !61 + %308 = bitcast <2 x i32> %307 to <2 x float>, !dbg !61 + %309 = fmul <2 x float> %305, %308, !dbg !80 + %310 = extractelement <2 x float> %302, i64 0, !dbg !81 + %311 = extractelement <2 x float> %302, i64 1, !dbg !81 + %312 = fadd float %310, %311, !dbg !81 + %313 = extractelement <2 x float> %309, i64 1, !dbg !81 + %314 = fadd float %313, %312, !dbg !81 + %315 = extractelement <2 x float> %309, i64 0, !dbg !81 + %316 = fadd float %315, %314, !dbg !81 + %317 = bitcast float %316 to i32, !dbg !87 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 16, i32 31), !dbg !87 + %319 = bitcast i32 %318 to float, !dbg !87 + %320 = fadd float %316, %319, !dbg !81 + %321 = bitcast float %320 to i32, !dbg !87 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 8, i32 31), !dbg !87 + %323 = bitcast i32 %322 to float, !dbg !87 + %324 = fadd float %320, %323, !dbg !81 + %325 = bitcast float %324 to i32, !dbg !87 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 4, i32 31), !dbg !87 + %327 = bitcast i32 %326 to float, !dbg !87 + %328 = fadd float %324, %327, !dbg !81 + %329 = bitcast float %328 to i32, !dbg !87 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 2, i32 31), !dbg !87 + %331 = bitcast i32 %330 to float, !dbg !87 + %332 = fadd float %328, %331, !dbg !81 + %333 = bitcast float %332 to i32, !dbg !87 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 1, i32 31), !dbg !87 + %335 = bitcast i32 %334 to float, !dbg !87 + %336 = fadd float %332, %335, !dbg !81 + %337 = icmp eq i32 %33, 0, !dbg !87 + %338 = zext nneg i32 %35 to i64, !dbg !87 + %339 = getelementptr float, ptr addrspace(3) @global_smem, i64 %338, !dbg !87 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %336, i1 %337) #3, !dbg !87 + tail call void @llvm.nvvm.barrier0(), !dbg !87 + %340 = icmp slt i32 %32, 2, !dbg !87 + %341 = sext i32 %32 to i64, !dbg !87 + %342 = getelementptr float, ptr addrspace(3) @global_smem, i64 %341, !dbg !87 + %343 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !87 + %344 = bitcast float %343 to i32, !dbg !87 + %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 1, i32 31), !dbg !87 + %346 = bitcast i32 %345 to float, !dbg !87 + %347 = fadd float %343, %346, !dbg !81 + %348 = and i32 %32, 1, !dbg !87 + %349 = icmp eq i32 %348, 0, !dbg !87 + %350 = and i1 %340, %349, !dbg !87 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %347, i1 %350) #3, !dbg !87 + tail call void @llvm.nvvm.barrier0(), !dbg !87 + %351 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !87 + %352 = fadd float %351, 0.000000e+00, !dbg !89 + %353 = fmul float %292, %310, !dbg !93 + %354 = fmul float %293, %311, !dbg !93 + %355 = fmul float %294, %313, !dbg !93 + %356 = fmul float %295, %315, !dbg !93 + tail call void @llvm.nvvm.barrier0(), !dbg !94 + %357 = fadd float %353, %354, !dbg !96 + %358 = fadd float %355, %357, !dbg !96 + %359 = fadd float %356, %358, !dbg !96 + %360 = bitcast float %359 to i32, !dbg !94 + %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 16, i32 31), !dbg !94 + %362 = bitcast i32 %361 to float, !dbg !94 + %363 = fadd float %359, %362, !dbg !96 + %364 = bitcast float %363 to i32, !dbg !94 + %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 8, i32 31), !dbg !94 + %366 = bitcast i32 %365 to float, !dbg !94 + %367 = fadd float %363, %366, !dbg !96 + %368 = bitcast float %367 to i32, !dbg !94 + %369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %368, i32 4, i32 31), !dbg !94 + %370 = bitcast i32 %369 to float, !dbg !94 + %371 = fadd float %367, %370, !dbg !96 + %372 = bitcast float %371 to i32, !dbg !94 + %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %372, i32 2, i32 31), !dbg !94 + %374 = bitcast i32 %373 to float, !dbg !94 + %375 = fadd float %371, %374, !dbg !96 + %376 = bitcast float %375 to i32, !dbg !94 + %377 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %376, i32 1, i32 31), !dbg !94 + %378 = bitcast i32 %377 to float, !dbg !94 + %379 = fadd float %375, %378, !dbg !96 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %379, i1 %337) #3, !dbg !94 + tail call void @llvm.nvvm.barrier0(), !dbg !94 + %380 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !94 + %381 = bitcast float %380 to i32, !dbg !94 + %382 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %381, i32 1, i32 31), !dbg !94 + %383 = bitcast i32 %382 to float, !dbg !94 + %384 = fadd float %380, %383, !dbg !96 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %384, i1 %350) #3, !dbg !94 + tail call void @llvm.nvvm.barrier0(), !dbg !94 + %385 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !94 + %386 = fadd float %385, 0.000000e+00, !dbg !99 + %387 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float 2.560000e+02) #3, !dbg !101 + %388 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %206, float 2.560000e+02) #3, !dbg !101 + %389 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float 2.560000e+02) #3, !dbg !101 + %390 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %210, float 2.560000e+02) #3, !dbg !101 + %391 = fmul float %310, 2.560000e+02, !dbg !102 + %392 = fmul float %311, 2.560000e+02, !dbg !102 + %393 = fmul float %313, 2.560000e+02, !dbg !102 + %394 = fmul float %315, 2.560000e+02, !dbg !102 + %395 = fsub float %391, %352, !dbg !103 + %396 = fsub float %392, %352, !dbg !103 + %397 = fsub float %393, %352, !dbg !103 + %398 = fsub float %394, %352, !dbg !103 + %399 = fmul float %292, %386, !dbg !104 + %400 = fmul float %293, %386, !dbg !104 + %401 = fmul float %294, %386, !dbg !104 + %402 = fmul float %295, %386, !dbg !104 + %403 = fsub float %395, %399, !dbg !105 + %404 = fsub float %396, %400, !dbg !105 + %405 = fsub float %397, %401, !dbg !105 + %406 = fsub float %398, %402, !dbg !105 + %407 = fmul float %387, %403, !dbg !106 + %408 = fmul float %387, %404, !dbg !106 + %409 = fmul float %387, %405, !dbg !106 + %410 = fmul float %387, %406, !dbg !106 + %411 = getelementptr float, ptr addrspace(1) %21, i64 %40, !dbg !107 + %412 = bitcast float %236 to i32, !dbg !108 + %413 = bitcast float %237 to i32, !dbg !108 + %414 = bitcast float %238 to i32, !dbg !108 + %415 = bitcast float %239 to i32, !dbg !108 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %412, i32 %413, i32 %414, i32 %415, ptr addrspace(1) %411, i1 true) #3, !dbg !108 + %416 = getelementptr float, ptr addrspace(1) %22, i64 %40, !dbg !109 + %417 = bitcast float %248 to i32, !dbg !110 + %418 = bitcast float %249 to i32, !dbg !110 + %419 = bitcast float %250 to i32, !dbg !110 + %420 = bitcast float %251 to i32, !dbg !110 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %417, i32 %418, i32 %419, i32 %420, ptr addrspace(1) %416, i1 true) #3, !dbg !110 + %421 = getelementptr float, ptr addrspace(1) %23, i64 %40, !dbg !111 + %422 = bitcast float %252 to i32, !dbg !112 + %423 = bitcast float %253 to i32, !dbg !112 + %424 = bitcast float %254 to i32, !dbg !112 + %425 = bitcast float %255 to i32, !dbg !112 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %422, i32 %423, i32 %424, i32 %425, ptr addrspace(1) %421, i1 true) #3, !dbg !112 + %426 = getelementptr float, ptr addrspace(1) %24, i64 %40, !dbg !113 + %427 = bitcast float %268 to i32, !dbg !114 + %428 = bitcast float %269 to i32, !dbg !114 + %429 = bitcast float %270 to i32, !dbg !114 + %430 = bitcast float %271 to i32, !dbg !114 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %427, i32 %428, i32 %429, i32 %430, ptr addrspace(1) %426, i1 true) #3, !dbg !114 + %431 = getelementptr float, ptr addrspace(1) %25, i64 %40, !dbg !115 + %432 = bitcast float %280 to i32, !dbg !116 + %433 = bitcast float %281 to i32, !dbg !116 + %434 = bitcast float %282 to i32, !dbg !116 + %435 = bitcast float %283 to i32, !dbg !116 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %432, i32 %433, i32 %434, i32 %435, ptr addrspace(1) %431, i1 true) #3, !dbg !116 + %436 = getelementptr float, ptr addrspace(1) %26, i64 %40, !dbg !117 + %437 = bitcast float %292 to i32, !dbg !118 + %438 = bitcast float %293 to i32, !dbg !118 + %439 = bitcast float %294 to i32, !dbg !118 + %440 = bitcast float %295 to i32, !dbg !118 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %437, i32 %438, i32 %439, i32 %440, ptr addrspace(1) %436, i1 true) #3, !dbg !118 + %441 = getelementptr float, ptr addrspace(1) %27, i64 %40, !dbg !119 + %442 = bitcast float %407 to i32, !dbg !120 + %443 = bitcast float %408 to i32, !dbg !120 + %444 = bitcast float %409 to i32, !dbg !120 + %445 = bitcast float %410 to i32, !dbg !120 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %443, i32 %444, i32 %445, ptr addrspace(1) %441, i1 true) #3, !dbg !120 + %446 = getelementptr i16, ptr addrspace(1) %28, i64 %40, !dbg !121 + %447 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %407) #3, !dbg !122 + %448 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %408) #3, !dbg !122 + %449 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %409) #3, !dbg !122 + %450 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %410) #3, !dbg !122 + %451 = insertelement <2 x i16> undef, i16 %447, i64 0, !dbg !122 + %452 = insertelement <2 x i16> %451, i16 %448, i64 1, !dbg !122 + %453 = bitcast <2 x i16> %452 to i32, !dbg !122 + %454 = insertelement <2 x i16> undef, i16 %449, i64 0, !dbg !122 + %455 = insertelement <2 x i16> %454, i16 %450, i64 1, !dbg !122 + %456 = bitcast <2 x i16> %455 to i32, !dbg !122 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %453, i32 %456, ptr addrspace(1) %446, i1 true) #3, !dbg !122 + ret void, !dbg !123 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cyo4ksjyladdfw6jgu5nyxbapyihb5b54nc6mogi76rx2lajsiff.py", directory: "/tmp/torchinductor_root/yo") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 26, column: 26, scope: !5) +!9 = !DILocation(line: 23, column: 28, scope: !5) +!10 = !DILocation(line: 30, column: 40, scope: !5) +!11 = !DILocation(line: 30, column: 36, scope: !5) +!12 = !DILocation(line: 30, column: 30, scope: !5) +!13 = !DILocation(line: 30, column: 46, scope: !5) +!14 = !DILocation(line: 31, column: 30, scope: !5) +!15 = !DILocation(line: 31, column: 46, scope: !5) +!16 = !DILocation(line: 31, column: 67, scope: !5) +!17 = !DILocation(line: 32, column: 30, scope: !5) +!18 = !DILocation(line: 32, column: 46, scope: !5) +!19 = !DILocation(line: 32, column: 67, scope: !5) +!20 = !DILocation(line: 33, column: 30, scope: !5) +!21 = !DILocation(line: 33, column: 35, scope: !5) +!22 = !DILocation(line: 34, column: 30, scope: !5) +!23 = !DILocation(line: 34, column: 35, scope: !5) +!24 = !DILocation(line: 35, column: 31, scope: !5) +!25 = !DILocation(line: 35, column: 47, scope: !5) +!26 = !DILocation(line: 35, column: 68, scope: !5) +!27 = !DILocation(line: 36, column: 31, scope: !5) +!28 = !DILocation(line: 36, column: 36, scope: !5) +!29 = !DILocation(line: 37, column: 31, scope: !5) +!30 = !DILocation(line: 37, column: 36, scope: !5) +!31 = !DILocation(line: 38, column: 31, scope: !5) +!32 = !DILocation(line: 38, column: 47, scope: !5) +!33 = !DILocation(line: 38, column: 68, scope: !5) +!34 = !DILocation(line: 39, column: 31, scope: !5) +!35 = !DILocation(line: 39, column: 47, scope: !5) +!36 = !DILocation(line: 39, column: 68, scope: !5) +!37 = !DILocation(line: 40, column: 32, scope: !5) +!38 = !DILocation(line: 40, column: 48, scope: !5) +!39 = !DILocation(line: 40, column: 69, scope: !5) +!40 = !DILocation(line: 41, column: 32, scope: !5) +!41 = !DILocation(line: 41, column: 37, scope: !5) +!42 = !DILocation(line: 42, column: 32, scope: !5) +!43 = !DILocation(line: 42, column: 37, scope: !5) +!44 = !DILocation(line: 43, column: 32, scope: !5) +!45 = !DILocation(line: 43, column: 48, scope: !5) +!46 = !DILocation(line: 43, column: 69, scope: !5) +!47 = !DILocation(line: 44, column: 32, scope: !5) +!48 = !DILocation(line: 44, column: 37, scope: !5) +!49 = !DILocation(line: 45, column: 32, scope: !5) +!50 = !DILocation(line: 45, column: 37, scope: !5) +!51 = !DILocation(line: 46, column: 32, scope: !5) +!52 = !DILocation(line: 46, column: 48, scope: !5) +!53 = !DILocation(line: 46, column: 69, scope: !5) +!54 = !DILocation(line: 47, column: 32, scope: !5) +!55 = !DILocation(line: 47, column: 37, scope: !5) +!56 = !DILocation(line: 48, column: 32, scope: !5) +!57 = !DILocation(line: 48, column: 37, scope: !5) +!58 = !DILocation(line: 49, column: 32, scope: !5) +!59 = !DILocation(line: 49, column: 48, scope: !5) +!60 = !DILocation(line: 50, column: 32, scope: !5) +!61 = !DILocation(line: 50, column: 37, scope: !5) +!62 = !DILocation(line: 52, column: 18, scope: !5) +!63 = !DILocation(line: 54, column: 18, scope: !5) +!64 = !DILocation(line: 55, column: 18, scope: !5) +!65 = !DILocation(line: 56, column: 19, scope: !5) +!66 = !DILocation(line: 58, column: 19, scope: !5) +!67 = !DILocation(line: 59, column: 20, scope: !5) +!68 = !DILocation(line: 60, column: 20, scope: !5) +!69 = !DILocation(line: 62, column: 20, scope: !5) +!70 = !DILocation(line: 64, column: 20, scope: !5) +!71 = !DILocation(line: 66, column: 20, scope: !5) +!72 = !DILocation(line: 67, column: 20, scope: !5) +!73 = !DILocation(line: 68, column: 20, scope: !5) +!74 = !DILocation(line: 70, column: 20, scope: !5) +!75 = !DILocation(line: 71, column: 20, scope: !5) +!76 = !DILocation(line: 72, column: 20, scope: !5) +!77 = !DILocation(line: 74, column: 20, scope: !5) +!78 = !DILocation(line: 75, column: 20, scope: !5) +!79 = !DILocation(line: 76, column: 20, scope: !5) +!80 = !DILocation(line: 77, column: 20, scope: !5) +!81 = !DILocation(line: 233, column: 15, scope: !82, inlinedAt: !85) +!82 = distinct !DILexicalBlockFile(scope: !84, file: !83, discriminator: 0) +!83 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!84 = distinct !DILexicalBlockFile(scope: !5, file: !83, discriminator: 0) +!85 = !DILocation(line: 243, column: 36, scope: !82, inlinedAt: !86) +!86 = !DILocation(line: 80, column: 59, scope: !82) +!87 = !DILocation(line: 243, column: 36, scope: !84, inlinedAt: !88) +!88 = !DILocation(line: 80, column: 59, scope: !84) +!89 = !DILocation(line: 8, column: 15, scope: !90, inlinedAt: !92) +!90 = distinct !DILexicalBlockFile(scope: !5, file: !91, discriminator: 0) +!91 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!92 = !DILocation(line: 80, column: 45, scope: !90) +!93 = !DILocation(line: 81, column: 20, scope: !5) +!94 = !DILocation(line: 243, column: 36, scope: !84, inlinedAt: !95) +!95 = !DILocation(line: 84, column: 59, scope: !84) +!96 = !DILocation(line: 233, column: 15, scope: !82, inlinedAt: !97) +!97 = !DILocation(line: 243, column: 36, scope: !82, inlinedAt: !98) +!98 = !DILocation(line: 84, column: 59, scope: !82) +!99 = !DILocation(line: 8, column: 15, scope: !90, inlinedAt: !100) +!100 = !DILocation(line: 84, column: 45, scope: !90) +!101 = !DILocation(line: 86, column: 20, scope: !5) +!102 = !DILocation(line: 87, column: 20, scope: !5) +!103 = !DILocation(line: 88, column: 20, scope: !5) +!104 = !DILocation(line: 89, column: 20, scope: !5) +!105 = !DILocation(line: 90, column: 20, scope: !5) +!106 = !DILocation(line: 91, column: 20, scope: !5) +!107 = !DILocation(line: 93, column: 25, scope: !5) +!108 = !DILocation(line: 93, column: 48, scope: !5) +!109 = !DILocation(line: 94, column: 25, scope: !5) +!110 = !DILocation(line: 94, column: 48, scope: !5) +!111 = !DILocation(line: 95, column: 25, scope: !5) +!112 = !DILocation(line: 95, column: 48, scope: !5) +!113 = !DILocation(line: 96, column: 25, scope: !5) +!114 = !DILocation(line: 96, column: 48, scope: !5) +!115 = !DILocation(line: 97, column: 25, scope: !5) +!116 = !DILocation(line: 97, column: 48, scope: !5) +!117 = !DILocation(line: 98, column: 25, scope: !5) +!118 = !DILocation(line: 98, column: 48, scope: !5) +!119 = !DILocation(line: 99, column: 25, scope: !5) +!120 = !DILocation(line: 99, column: 48, scope: !5) +!121 = !DILocation(line: 100, column: 25, scope: !5) +!122 = !DILocation(line: 100, column: 48, scope: !5) +!123 = !DILocation(line: 100, column: 4, scope: !5) diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttgir b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..845b90d47888616d8876066ba0ab5c85fb673044 --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttgir @@ -0,0 +1,168 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: !tt.ptr {tt.divisibility = 16 : i32}, %arg13: !tt.ptr {tt.divisibility = 16 : i32}, %arg14: !tt.ptr {tt.divisibility = 16 : i32}, %arg15: !tt.ptr {tt.divisibility = 16 : i32}, %arg16: !tt.ptr {tt.divisibility = 16 : i32}, %arg17: !tt.ptr {tt.divisibility = 16 : i32}, %arg18: !tt.ptr {tt.divisibility = 16 : i32}, %arg19: !tt.ptr {tt.divisibility = 16 : i32}, %arg20: !tt.ptr {tt.divisibility = 16 : i32}, %arg21: !tt.ptr {tt.divisibility = 16 : i32}, %arg22: !tt.ptr {tt.divisibility = 16 : i32}, %arg23: !tt.ptr {tt.divisibility = 16 : i32}, %arg24: !tt.ptr {tt.divisibility = 16 : i32}, %arg25: !tt.ptr {tt.divisibility = 16 : i32}, %arg26: !tt.ptr {tt.divisibility = 16 : i32}, %arg27: !tt.ptr {tt.divisibility = 16 : i32}, %arg28: !tt.ptr {tt.divisibility = 16 : i32}, %arg29: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg30: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %17 = tt.addptr %arg3, %0 : !tt.ptr, i32 + %18 = tt.splat %17 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %19 = tt.load %18 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %20 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %21 = tt.splat %20 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %23 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %24 = tt.addptr %23, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %25 = tt.load %24, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %26 = arith.extf %25 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %27 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %28 = tt.splat %27 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %29 = tt.load %28 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %30 = tt.addptr %arg7, %0 : !tt.ptr, i32 + %31 = tt.splat %30 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %32 = tt.load %31 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %33 = tt.splat %arg8 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %35 = tt.load %34, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %36 = arith.extf %35 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %37 = tt.splat %arg9 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %39 = tt.load %38, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %40 = arith.extf %39 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %41 = tt.splat %arg10 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %42 = tt.addptr %41, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %43 = tt.load %42, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %44 = arith.extf %43 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %45 = tt.addptr %arg11, %0 : !tt.ptr, i32 + %46 = tt.splat %45 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %47 = tt.load %46 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %48 = tt.addptr %arg12, %0 : !tt.ptr, i32 + %49 = tt.splat %48 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %50 = tt.load %49 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %51 = tt.splat %arg13 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %52 = tt.addptr %51, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %53 = tt.load %52, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %54 = arith.extf %53 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %55 = tt.addptr %arg14, %0 : !tt.ptr, i32 + %56 = tt.splat %55 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %57 = tt.load %56 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %58 = tt.addptr %arg15, %0 : !tt.ptr, i32 + %59 = tt.splat %58 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %60 = tt.load %59 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %61 = tt.splat %arg16 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %62 = tt.addptr %61, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %63 = tt.load %62, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %64 = arith.extf %63 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %65 = tt.addptr %arg17, %0 : !tt.ptr, i32 + %66 = tt.splat %65 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %67 = tt.load %66 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %68 = tt.addptr %arg18, %0 : !tt.ptr, i32 + %69 = tt.splat %68 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %70 = tt.load %69 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %71 = tt.splat %arg19 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %72 = tt.addptr %71, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %73 = tt.load %72, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %74 = tt.splat %arg20 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %75 = tt.addptr %74, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %76 = tt.load %75, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %77 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %78 = arith.addf %77, %16 : tensor<256xf32, #blocked> + %79 = tt.broadcast %19 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %80 = arith.subf %78, %79 : tensor<256xf32, #blocked> + %81 = tt.broadcast %22 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %82 = arith.mulf %80, %81 : tensor<256xf32, #blocked> + %83 = arith.addf %78, %26 : tensor<256xf32, #blocked> + %84 = tt.broadcast %29 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %85 = arith.subf %83, %84 : tensor<256xf32, #blocked> + %86 = tt.broadcast %32 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %87 = arith.mulf %85, %86 : tensor<256xf32, #blocked> + %88 = arith.addf %83, %36 : tensor<256xf32, #blocked> + %89 = arith.addf %88, %40 : tensor<256xf32, #blocked> + %90 = arith.addf %89, %44 : tensor<256xf32, #blocked> + %91 = tt.broadcast %47 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %92 = arith.subf %90, %91 : tensor<256xf32, #blocked> + %93 = tt.broadcast %50 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %94 = arith.mulf %92, %93 : tensor<256xf32, #blocked> + %95 = arith.addf %90, %54 : tensor<256xf32, #blocked> + %96 = tt.broadcast %57 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %97 = arith.subf %95, %96 : tensor<256xf32, #blocked> + %98 = tt.broadcast %60 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %99 = arith.mulf %97, %98 : tensor<256xf32, #blocked> + %100 = arith.addf %95, %64 : tensor<256xf32, #blocked> + %101 = tt.broadcast %67 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %102 = arith.subf %100, %101 : tensor<256xf32, #blocked> + %103 = tt.broadcast %70 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %104 = arith.mulf %102, %103 : tensor<256xf32, #blocked> + %105 = arith.mulf %73, %76 : tensor<256xf32, #blocked> + %106 = arith.select %2, %105, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %107 = "tt.reduce"(%106) <{axis = 0 : i32}> ({ + ^bb0(%arg31: f32, %arg32: f32): + %139 = arith.addf %arg31, %arg32 : f32 + tt.reduce.return %139 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %108 = arith.addf %107, %cst_1 : f32 + %109 = arith.mulf %105, %104 : tensor<256xf32, #blocked> + %110 = arith.select %2, %109, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %111 = "tt.reduce"(%110) <{axis = 0 : i32}> ({ + ^bb0(%arg31: f32, %arg32: f32): + %139 = arith.addf %arg31, %arg32 : f32 + tt.reduce.return %139 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %112 = arith.addf %111, %cst_1 : f32 + %113 = arith.divf %70, %cst_0 : tensor<1xf32, #blocked> + %114 = arith.mulf %105, %cst_3 : tensor<256xf32, #blocked> + %115 = tt.splat %108 : (f32) -> tensor<256xf32, #blocked> + %116 = arith.subf %114, %115 : tensor<256xf32, #blocked> + %117 = tt.splat %112 : (f32) -> tensor<256xf32, #blocked> + %118 = arith.mulf %104, %117 : tensor<256xf32, #blocked> + %119 = arith.subf %116, %118 : tensor<256xf32, #blocked> + %120 = tt.broadcast %113 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %121 = arith.mulf %120, %119 : tensor<256xf32, #blocked> + %122 = tt.splat %arg21 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %123 = tt.addptr %122, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %123, %82, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %124 = tt.splat %arg22 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %125 = tt.addptr %124, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %125, %87, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %126 = tt.splat %arg23 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %127 = tt.addptr %126, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %127, %88, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %128 = tt.splat %arg24 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %129 = tt.addptr %128, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %129, %94, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %130 = tt.splat %arg25 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %131 = tt.addptr %130, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %131, %99, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %132 = tt.splat %arg26 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %133 = tt.addptr %132, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %133, %104, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %134 = tt.splat %arg27 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %135 = tt.addptr %134, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %135, %121, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %136 = tt.splat %arg28 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %137 = tt.addptr %136, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %138 = arith.truncf %121 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %137, %138, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.llir b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..117a2548007476a20318e27cacd0b0080d3b8f34 --- /dev/null +++ b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.llir @@ -0,0 +1,111 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1d2d3d4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = shl i32 %6, 1, !dbg !8 + %8 = and i32 %7, 510, !dbg !8 + %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %10 = shl i32 %9, 9, !dbg !10 + %11 = or i32 %10, %8, !dbg !11 + %.frozen = freeze i32 %11 + %12 = sdiv i32 %.frozen, 256, !dbg !12 + %13 = srem i32 %12, 3, !dbg !13 + %14 = mul i32 %12, 256 + %.decomposed = sub i32 %.frozen, %14 + %15 = sdiv i32 %11, 768, !dbg !14 + %16 = shl nsw i32 %15, 8, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = sext i32 %17 to i64, !dbg !17 + %19 = getelementptr i16, ptr addrspace(1) %0, i64 %18, !dbg !17 + %20 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 true) #1, !dbg !18 + %21 = trunc i32 %20 to i16, !dbg !18 + %extelt.offset = lshr i32 %20, 16, !dbg !18 + %22 = trunc i32 %extelt.offset to i16, !dbg !18 + %23 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #1, !dbg !19 + %24 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #1, !dbg !19 + %25 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !20 + %26 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %25, i1 true) #1, !dbg !21 + %27 = trunc i32 %26 to i16, !dbg !21 + %extelt.offset1 = lshr i32 %26, 16, !dbg !21 + %28 = trunc i32 %extelt.offset1 to i16, !dbg !21 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #1, !dbg !22 + %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #1, !dbg !22 + %31 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !23 + %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %31, i1 true) #1, !dbg !24 + %33 = trunc i32 %32 to i16, !dbg !24 + %extelt.offset2 = lshr i32 %32, 16, !dbg !24 + %34 = trunc i32 %extelt.offset2 to i16, !dbg !24 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #1, !dbg !25 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #1, !dbg !25 + %37 = icmp eq i32 %13, 2, !dbg !26 + %38 = select i1 %37, float %23, float 0.000000e+00, !dbg !27 + %39 = select i1 %37, float %24, float 0.000000e+00, !dbg !27 + %40 = icmp eq i32 %13, 1, !dbg !28 + %41 = select i1 %40, float %29, float 0.000000e+00, !dbg !29 + %42 = select i1 %40, float %30, float 0.000000e+00, !dbg !29 + %43 = fadd float %38, %41, !dbg !30 + %44 = fadd float %39, %42, !dbg !30 + %45 = icmp eq i32 %13, 0, !dbg !31 + %46 = select i1 %45, float %35, float 0.000000e+00, !dbg !32 + %47 = select i1 %45, float %36, float 0.000000e+00, !dbg !32 + %48 = fadd float %43, %46, !dbg !33 + %49 = fadd float %44, %47, !dbg !33 + %50 = sext i32 %11 to i64, !dbg !34 + %51 = getelementptr i16, ptr addrspace(1) %3, i64 %50, !dbg !34 + %52 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %48) #1, !dbg !35 + %53 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %49) #1, !dbg !35 + %54 = insertelement <2 x i16> undef, i16 %52, i64 0, !dbg !35 + %55 = insertelement <2 x i16> %54, i16 %53, i64 1, !dbg !35 + %56 = bitcast <2 x i16> %55 to i32, !dbg !35 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %56, ptr addrspace(1) %51, i1 true) #1, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py", directory: "/tmp/torchinductor_root/63") +!3 = !{ptr @triton__0d1d2d3d4de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4de, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4de", linkageName: "triton__0d1d2d3d4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 23, column: 20, scope: !5) +!13 = !DILocation(line: 23, column: 27, scope: !5) +!14 = !DILocation(line: 25, column: 20, scope: !5) +!15 = !DILocation(line: 27, column: 40, scope: !5) +!16 = !DILocation(line: 27, column: 36, scope: !5) +!17 = !DILocation(line: 27, column: 30, scope: !5) +!18 = !DILocation(line: 27, column: 46, scope: !5) +!19 = !DILocation(line: 27, column: 85, scope: !5) +!20 = !DILocation(line: 28, column: 30, scope: !5) +!21 = !DILocation(line: 28, column: 46, scope: !5) +!22 = !DILocation(line: 28, column: 85, scope: !5) +!23 = !DILocation(line: 29, column: 31, scope: !5) +!24 = !DILocation(line: 29, column: 47, scope: !5) +!25 = !DILocation(line: 29, column: 86, scope: !5) +!26 = !DILocation(line: 32, column: 19, scope: !5) +!27 = !DILocation(line: 34, column: 32, scope: !5) +!28 = !DILocation(line: 36, column: 19, scope: !5) +!29 = !DILocation(line: 37, column: 32, scope: !5) +!30 = !DILocation(line: 38, column: 19, scope: !5) +!31 = !DILocation(line: 40, column: 20, scope: !5) +!32 = !DILocation(line: 41, column: 35, scope: !5) +!33 = !DILocation(line: 42, column: 20, scope: !5) +!34 = !DILocation(line: 43, column: 25, scope: !5) +!35 = !DILocation(line: 43, column: 37, scope: !5) +!36 = !DILocation(line: 43, column: 4, scope: !5) diff --git a/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ptx b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f93dac3e024acef49f76bb830400d9b58e41b3c8 --- /dev/null +++ b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ptx @@ -0,0 +1,387 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4de + +.visible .entry triton__0d1d2d3d4de( + .param .u64 triton__0d1d2d3d4de_param_0, + .param .u64 triton__0d1d2d3d4de_param_1, + .param .u64 triton__0d1d2d3d4de_param_2, + .param .u64 triton__0d1d2d3d4de_param_3, + .param .u32 triton__0d1d2d3d4de_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<8>; + .reg .b16 %rs<9>; + .reg .b32 %r<38>; + .reg .f32 %f<17>; + .reg .b64 %rd<11>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0]; + ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r14, %tid.x; + shl.b32 %r15, %r14, 1; + ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2]; + and.b32 %r16, %r15, 510; + ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3]; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r17, %r1, 9; + .loc 1 21 23 + or.b32 %r18, %r17, %r16; + .loc 1 23 20 + shr.s32 %r20, %r18, 31; + shr.u32 %r21, %r20, 24; + add.s32 %r22, %r18, %r21; + shr.s32 %r23, %r22, 8; + .loc 1 23 27 + mul.hi.s32 %r24, %r23, 1431655766; + shr.u32 %r25, %r24, 31; + add.s32 %r26, %r24, %r25; + mul.lo.s32 %r27, %r26, 3; + sub.s32 %r28, %r23, %r27; + and.b32 %r29, %r22, -256; + sub.s32 %r30, %r18, %r29; + .loc 1 25 20 + mul.hi.s32 %r31, %r18, 715827883; + shr.u32 %r32, %r31, 31; + shr.u32 %r33, %r31, 7; + add.s32 %r34, %r33, %r32; + .loc 1 27 40 + shl.b32 %r35, %r34, 8; + .loc 1 27 36 + add.s32 %r36, %r35, %r30; + .loc 1 27 30 + mul.wide.s32 %rd9, %r36, 2; + add.s64 %rd1, %rd5, %rd9; + mov.pred %p1, -1; + .loc 1 27 46 + mov.u32 %r2, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r2 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + .loc 1 27 85 + cvt.f32.bf16 %r3, %rs1; + mov.b32 %f1, %r3; + cvt.f32.bf16 %r4, %rs2; + mov.b32 %f2, %r4; + .loc 1 28 30 + add.s64 %rd2, %rd6, %rd9; + .loc 1 28 46 + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r5 }, [ %rd2 + 0 ]; + cvt.u16.u32 %rs3, %r5; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; } + .loc 1 28 85 + cvt.f32.bf16 %r6, %rs3; + mov.b32 %f3, %r6; + cvt.f32.bf16 %r7, %rs4; + mov.b32 %f4, %r7; + .loc 1 29 31 + add.s64 %rd3, %rd7, %rd9; + .loc 1 29 47 + mov.u32 %r8, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r8 }, [ %rd3 + 0 ]; + cvt.u16.u32 %rs5, %r8; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r8; } + .loc 1 29 86 + cvt.f32.bf16 %r9, %rs5; + mov.b32 %f5, %r9; + cvt.f32.bf16 %r10, %rs6; + mov.b32 %f6, %r10; + .loc 1 32 19 + setp.eq.s32 %p5, %r28, 2; + .loc 1 34 32 + selp.f32 %f7, %f1, 0f00000000, %p5; + selp.f32 %f8, %f2, 0f00000000, %p5; + .loc 1 36 19 + setp.eq.s32 %p6, %r28, 1; + .loc 1 37 32 + selp.f32 %f9, %f3, 0f00000000, %p6; + selp.f32 %f10, %f4, 0f00000000, %p6; + .loc 1 38 19 + add.f32 %f11, %f7, %f9; + add.f32 %f12, %f8, %f10; + .loc 1 40 20 + setp.eq.s32 %p7, %r28, 0; + .loc 1 41 35 + selp.f32 %f13, %f5, 0f00000000, %p7; + selp.f32 %f14, %f6, 0f00000000, %p7; + .loc 1 42 20 + add.f32 %f15, %f11, %f13; + add.f32 %f16, %f12, %f14; + .loc 1 43 25 + mul.wide.s32 %rd10, %r18, 2; + add.s64 %rd4, %rd8, %rd10; + .loc 1 43 37 + mov.b32 %r11, %f15; + cvt.rn.bf16.f32 %rs7, %r11; + mov.b32 %r12, %f16; + cvt.rn.bf16.f32 %rs8, %r12; + mov.b32 %r37, {%rs7, %rs8}; + @%p1 st.global.b32 [ %rd4 + 0 ], { %r37 }; + .loc 1 43 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 184 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 51 +.b8 114 +.b8 55 +.b8 105 +.b8 117 +.b8 114 +.b8 119 +.b8 107 +.b8 53 +.b8 121 +.b8 100 +.b8 108 +.b8 115 +.b8 119 +.b8 104 +.b8 55 +.b8 114 +.b8 118 +.b8 104 +.b8 99 +.b8 109 +.b8 108 +.b8 120 +.b8 50 +.b8 99 +.b8 102 +.b8 114 +.b8 101 +.b8 116 +.b8 108 +.b8 114 +.b8 101 +.b8 119 +.b8 103 +.b8 119 +.b8 54 +.b8 116 +.b8 108 +.b8 106 +.b8 108 +.b8 117 +.b8 114 +.b8 115 +.b8 115 +.b8 104 +.b8 103 +.b8 116 +.b8 102 +.b8 112 +.b8 112 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 51 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 188 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 188 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ttgir b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..418903e8c378ddfa6f0757e9a62ca3863b0458d4 --- /dev/null +++ b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ttgir @@ -0,0 +1,49 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<512xi32, #blocked> + %cst_0 = arith.constant dense<3> : tensor<512xi32, #blocked> + %cst_1 = arith.constant dense<768> : tensor<512xi32, #blocked> + %cst_2 = arith.constant dense<2> : tensor<512xi32, #blocked> + %cst_3 = arith.constant dense<0> : tensor<512xi32, #blocked> + %cst_4 = arith.constant dense<1> : tensor<512xi32, #blocked> + %cst_5 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = arith.divsi %4, %cst : tensor<512xi32, #blocked> + %6 = arith.remsi %5, %cst_0 : tensor<512xi32, #blocked> + %7 = arith.remsi %4, %cst : tensor<512xi32, #blocked> + %8 = arith.divsi %4, %cst_1 : tensor<512xi32, #blocked> + %9 = arith.muli %8, %cst : tensor<512xi32, #blocked> + %10 = arith.addi %7, %9 : tensor<512xi32, #blocked> + %11 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %12 = tt.addptr %11, %10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %14 = arith.extf %13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %15 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %16 = tt.addptr %15, %10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %18 = arith.extf %17 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %19 = tt.splat %arg2 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %20 = tt.addptr %19, %10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %22 = arith.extf %21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %23 = arith.cmpi eq, %6, %cst_2 : tensor<512xi32, #blocked> + %24 = arith.select %23, %14, %cst_5 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> + %25 = arith.cmpi eq, %6, %cst_4 : tensor<512xi32, #blocked> + %26 = arith.select %25, %18, %cst_5 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> + %27 = arith.addf %24, %26 : tensor<512xf32, #blocked> + %28 = arith.cmpi eq, %6, %cst_3 : tensor<512xi32, #blocked> + %29 = arith.select %28, %22, %cst_5 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> + %30 = arith.addf %27, %29 : tensor<512xf32, #blocked> + %31 = tt.splat %arg3 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %32 = tt.addptr %31, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %33 = arith.truncf %30 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> + tt.store %32, %33 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ttir b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b71eedeae3981e272e19b3c61c07d24ffd576f9c --- /dev/null +++ b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.ttir @@ -0,0 +1,48 @@ +module { + tt.func public @triton__0d1d2d3d4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<512xi32> + %cst_0 = arith.constant dense<1> : tensor<512xi32> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32> + %cst_2 = arith.constant dense<2> : tensor<512xi32> + %cst_3 = arith.constant dense<768> : tensor<512xi32> + %cst_4 = arith.constant dense<3> : tensor<512xi32> + %cst_5 = arith.constant dense<256> : tensor<512xi32> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = arith.divsi %4, %cst_5 : tensor<512xi32> + %6 = arith.remsi %5, %cst_4 : tensor<512xi32> + %7 = arith.remsi %4, %cst_5 : tensor<512xi32> + %8 = arith.divsi %4, %cst_3 : tensor<512xi32> + %9 = arith.muli %8, %cst_5 : tensor<512xi32> + %10 = arith.addi %7, %9 : tensor<512xi32> + %11 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %12 = tt.addptr %11, %10 : tensor<512x!tt.ptr>, tensor<512xi32> + %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<512xbf16> + %14 = arith.extf %13 : tensor<512xbf16> to tensor<512xf32> + %15 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %16 = tt.addptr %15, %10 : tensor<512x!tt.ptr>, tensor<512xi32> + %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<512xbf16> + %18 = arith.extf %17 : tensor<512xbf16> to tensor<512xf32> + %19 = tt.splat %arg2 : (!tt.ptr) -> tensor<512x!tt.ptr> + %20 = tt.addptr %19, %10 : tensor<512x!tt.ptr>, tensor<512xi32> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<512xbf16> + %22 = arith.extf %21 : tensor<512xbf16> to tensor<512xf32> + %23 = arith.cmpi eq, %6, %cst_2 : tensor<512xi32> + %24 = arith.select %23, %14, %cst_1 : tensor<512xi1>, tensor<512xf32> + %25 = arith.cmpi eq, %6, %cst_0 : tensor<512xi32> + %26 = arith.select %25, %18, %cst_1 : tensor<512xi1>, tensor<512xf32> + %27 = arith.addf %24, %26 : tensor<512xf32> + %28 = arith.cmpi eq, %6, %cst : tensor<512xi32> + %29 = arith.select %28, %22, %cst_1 : tensor<512xi1>, tensor<512xf32> + %30 = arith.addf %27, %29 : tensor<512xf32> + %31 = tt.splat %arg3 : (!tt.ptr) -> tensor<512x!tt.ptr> + %32 = tt.addptr %31, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %33 = arith.truncf %30 : tensor<512xf32> to tensor<512xbf16> + tt.store %32, %33 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16> + tt.return + } +} diff --git a/wandb/run-20240926_180814-1klxtkie/logs/debug-internal.log b/wandb/run-20240926_180814-1klxtkie/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9fa60cacd6e59a75776d1fc68705e8247d14ccde --- /dev/null +++ b/wandb/run-20240926_180814-1klxtkie/logs/debug-internal.log @@ -0,0 +1,24 @@ +{"time":"2024-09-26T18:08:14.296778164Z","level":"INFO","msg":"using version","core version":"0.18.1"} +{"time":"2024-09-26T18:08:14.296827848Z","level":"INFO","msg":"created symlink","path":"/root/wandb/run-20240926_180814-1klxtkie/logs/debug-core.log"} +{"time":"2024-09-26T18:08:14.296930672Z","level":"INFO","msg":"using version","core version":"0.18.1"} +{"time":"2024-09-26T18:08:14.29694264Z","level":"INFO","msg":"created symlink","path":"/root/wandb/run-20240926_180814-1klxtkie/logs/debug-core.log"} +{"time":"2024-09-26T18:08:14.300052084Z","level":"INFO","msg":"created new stream","id":"1klxtkie"} +{"time":"2024-09-26T18:08:14.300091673Z","level":"INFO","msg":"stream: started","id":"1klxtkie"} +{"time":"2024-09-26T18:08:14.300138614Z","level":"INFO","msg":"sender: started","stream_id":{"value":"1klxtkie"}} +{"time":"2024-09-26T18:08:14.300147136Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"1klxtkie"}} +{"time":"2024-09-26T18:08:14.300284362Z","level":"INFO","msg":"handler: started","stream_id":{"value":"1klxtkie"}} +{"time":"2024-09-26T18:08:14.897132609Z","level":"INFO","msg":"wandb-core","!BADKEY":null} +{"time":"2024-09-26T18:08:14.89985244Z","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-09-26T19:56:45.23312372Z","level":"INFO","msg":"api: retrying HTTP request, no error or response"} +{"time":"2024-09-26T21:53:47.961476555Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/tulasiram/gpt2_positional_encodings_10B/1klxtkie/file_stream"} +{"time":"2024-09-26T23:32:45.458917296Z","level":"INFO","msg":"api: retrying HTTP request, no error or response"} +{"time":"2024-09-27T00:54:21.660977667Z","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-09-27T00:54:21.66127363Z","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-09-27T00:54:22.109427259Z","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2024-09-27T00:54:22.109445727Z","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2024-09-27T00:54:22.109450685Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2024-09-27T00:54:24.403877609Z","level":"INFO","msg":"stream: closing","id":"1klxtkie"} +{"time":"2024-09-27T00:54:24.40391152Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"1klxtkie"}} +{"time":"2024-09-27T00:54:24.403937999Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"1klxtkie"}} +{"time":"2024-09-27T00:54:24.403979391Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"1klxtkie"}} +{"time":"2024-09-27T00:54:24.404135705Z","level":"INFO","msg":"stream: closed","id":"1klxtkie"} diff --git a/wandb/run-20240927_005424-60260ulk/files/output.log b/wandb/run-20240927_005424-60260ulk/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..846e58ae25c086268a1e1e1a72f990eaa48d0966 --- /dev/null +++ b/wandb/run-20240927_005424-60260ulk/files/output.log @@ -0,0 +1,701 @@ +Training polynomial_legendre + default: 5%|▎ | 500/10000 [19:50<5:33:40, 2.11s/it, loss=5.5479, lr=5.98e-04, mfu=9.57%, time_per_iter_ms=2107.69ms] + +Step 100: +Train loss: 7.4737, Val loss: 7.4699 +wikitext-103-v1 - Train loss: 8.3942, Val loss: 8.3996 +ptb - Train loss: 8.1018, Val loss: 8.1061 +lambada - Train loss: 7.3301, Val loss: 7.3265 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 200: +Train loss: 6.4669, Val loss: 6.4581 +wikitext-103-v1 - Train loss: 7.6986, Val loss: 7.6981 +ptb - Train loss: 7.9491, Val loss: 7.9708 +lambada - Train loss: 6.2100, Val loss: 6.2135 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 300: +Train loss: 6.0532, Val loss: 6.0477 +wikitext-103-v1 - Train loss: 7.4305, Val loss: 7.4272 +ptb - Train loss: 7.8318, Val loss: 7.8674 +lambada - Train loss: 5.7627, Val loss: 5.7725 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 400: +Train loss: 5.7497, Val loss: 5.7519 +wikitext-103-v1 - Train loss: 7.2072, Val loss: 7.2038 +ptb - Train loss: 7.5535, Val loss: 7.5784 +lambada - Train loss: 5.5935, Val loss: 5.6010 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 500: +Train loss: 5.5168, Val loss: 5.5149 +wikitext-103-v1 - Train loss: 6.9888, Val loss: 6.9952 +ptb - Train loss: 7.3114, Val loss: 7.3424 +lambada - Train loss: 5.4025, Val loss: 5.4101 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 600: +Train loss: 5.3115, Val loss: 5.3138 +wikitext-103-v1 - Train loss: 6.8503, Val loss: 6.8503 +ptb - Train loss: 7.1324, Val loss: 7.1702 +lambada - Train loss: 5.2585, Val loss: 5.2682 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 700: +Train loss: 5.1637, Val loss: 5.1685 +wikitext-103-v1 - Train loss: 6.7540, Val loss: 6.7488 +ptb - Train loss: 7.0577, Val loss: 7.1079 +lambada - Train loss: 5.1652, Val loss: 5.1726 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 800: +Train loss: 5.0372, Val loss: 5.0465 +wikitext-103-v1 - Train loss: 6.6109, Val loss: 6.5986 +ptb - Train loss: 6.8470, Val loss: 6.8943 +lambada - Train loss: 5.0726, Val loss: 5.0837 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 900: +Train loss: 4.9138, Val loss: 4.9066 +wikitext-103-v1 - Train loss: 6.4477, Val loss: 6.4434 +ptb - Train loss: 6.7143, Val loss: 6.7681 +lambada - Train loss: 5.0109, Val loss: 5.0292 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1000: +Train loss: 4.8004, Val loss: 4.7951 +wikitext-103-v1 - Train loss: 6.2217, Val loss: 6.2116 +ptb - Train loss: 6.3391, Val loss: 6.4022 +lambada - Train loss: 4.9635, Val loss: 4.9739 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1100: +Train loss: 4.7042, Val loss: 4.7059 +wikitext-103-v1 - Train loss: 6.0525, Val loss: 6.0399 +ptb - Train loss: 5.9756, Val loss: 6.0549 +lambada - Train loss: 4.9094, Val loss: 4.9171 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1200: +Train loss: 4.6451, Val loss: 4.6400 +wikitext-103-v1 - Train loss: 5.9364, Val loss: 5.9276 +ptb - Train loss: 5.8161, Val loss: 5.9000 +lambada - Train loss: 4.8897, Val loss: 4.8993 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1300: +Train loss: 4.5584, Val loss: 4.5602 +wikitext-103-v1 - Train loss: 5.7837, Val loss: 5.7676 +ptb - Train loss: 5.6750, Val loss: 5.7531 +lambada - Train loss: 4.8295, Val loss: 4.8356 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1400: +Train loss: 4.5008, Val loss: 4.5021 +wikitext-103-v1 - Train loss: 5.7216, Val loss: 5.7111 +ptb - Train loss: 5.5782, Val loss: 5.6721 +lambada - Train loss: 4.8072, Val loss: 4.8152 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1500: +Train loss: 4.4688, Val loss: 4.4713 +wikitext-103-v1 - Train loss: 5.6502, Val loss: 5.6472 +ptb - Train loss: 5.5434, Val loss: 5.6402 +lambada - Train loss: 4.7909, Val loss: 4.7969 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1600: +Train loss: 4.4411, Val loss: 4.4383 +wikitext-103-v1 - Train loss: 5.6296, Val loss: 5.6237 +ptb - Train loss: 5.5138, Val loss: 5.6008 +lambada - Train loss: 4.7731, Val loss: 4.7760 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1700: +Train loss: 4.4035, Val loss: 4.4052 +wikitext-103-v1 - Train loss: 5.5661, Val loss: 5.5557 +ptb - Train loss: 5.4606, Val loss: 5.5561 +lambada - Train loss: 4.7418, Val loss: 4.7489 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1800: +Train loss: 4.3867, Val loss: 4.3817 +wikitext-103-v1 - Train loss: 5.5264, Val loss: 5.5131 +ptb - Train loss: 5.4284, Val loss: 5.5195 +lambada - Train loss: 4.7461, Val loss: 4.7497 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 1900: +Train loss: 4.3501, Val loss: 4.3555 +wikitext-103-v1 - Train loss: 5.5002, Val loss: 5.4934 +ptb - Train loss: 5.4079, Val loss: 5.4945 +lambada - Train loss: 4.7118, Val loss: 4.7206 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2000: +Train loss: 4.3295, Val loss: 4.3282 +wikitext-103-v1 - Train loss: 5.4934, Val loss: 5.4834 +ptb - Train loss: 5.3946, Val loss: 5.4838 +lambada - Train loss: 4.7160, Val loss: 4.7172 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2100: +Train loss: 4.3026, Val loss: 4.3030 +wikitext-103-v1 - Train loss: 5.4312, Val loss: 5.4205 +ptb - Train loss: 5.3493, Val loss: 5.4330 +lambada - Train loss: 4.6839, Val loss: 4.6957 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2200: +Train loss: 4.2893, Val loss: 4.2895 +wikitext-103-v1 - Train loss: 5.4201, Val loss: 5.4085 +ptb - Train loss: 5.3570, Val loss: 5.4569 +lambada - Train loss: 4.6692, Val loss: 4.6836 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2300: +Train loss: 4.2645, Val loss: 4.2635 +wikitext-103-v1 - Train loss: 5.4225, Val loss: 5.4096 +ptb - Train loss: 5.3160, Val loss: 5.4128 +lambada - Train loss: 4.6652, Val loss: 4.6709 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2400: +Train loss: 4.2638, Val loss: 4.2619 +wikitext-103-v1 - Train loss: 5.3724, Val loss: 5.3646 +ptb - Train loss: 5.2937, Val loss: 5.3858 +lambada - Train loss: 4.6680, Val loss: 4.6788 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2500: +Train loss: 4.2484, Val loss: 4.2368 +wikitext-103-v1 - Train loss: 5.3741, Val loss: 5.3582 +ptb - Train loss: 5.3022, Val loss: 5.4054 +lambada - Train loss: 4.6511, Val loss: 4.6555 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2600: +Train loss: 4.2299, Val loss: 4.2313 +wikitext-103-v1 - Train loss: 5.3483, Val loss: 5.3359 +ptb - Train loss: 5.2711, Val loss: 5.3650 +lambada - Train loss: 4.6342, Val loss: 4.6368 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2700: +Train loss: 4.2196, Val loss: 4.2160 +wikitext-103-v1 - Train loss: 5.3275, Val loss: 5.3204 +ptb - Train loss: 5.2493, Val loss: 5.3414 +lambada - Train loss: 4.6338, Val loss: 4.6409 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2800: +Train loss: 4.2036, Val loss: 4.2071 +wikitext-103-v1 - Train loss: 5.3024, Val loss: 5.2921 +ptb - Train loss: 5.2121, Val loss: 5.3058 +lambada - Train loss: 4.6178, Val loss: 4.6223 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 2900: +Train loss: 4.2015, Val loss: 4.1968 +wikitext-103-v1 - Train loss: 5.3016, Val loss: 5.2896 +ptb - Train loss: 5.2291, Val loss: 5.3271 +lambada - Train loss: 4.6142, Val loss: 4.6126 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3000: +Train loss: 4.1899, Val loss: 4.1919 +wikitext-103-v1 - Train loss: 5.2997, Val loss: 5.2933 +ptb - Train loss: 5.2269, Val loss: 5.3239 +lambada - Train loss: 4.6206, Val loss: 4.6246 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3100: +Train loss: 4.1695, Val loss: 4.1767 +wikitext-103-v1 - Train loss: 5.2976, Val loss: 5.2833 +ptb - Train loss: 5.2160, Val loss: 5.3148 +lambada - Train loss: 4.6050, Val loss: 4.6105 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3200: +Train loss: 4.1696, Val loss: 4.1736 +wikitext-103-v1 - Train loss: 5.3025, Val loss: 5.2927 +ptb - Train loss: 5.2375, Val loss: 5.3327 +lambada - Train loss: 4.6048, Val loss: 4.6111 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3300: +Train loss: 4.1561, Val loss: 4.1632 +wikitext-103-v1 - Train loss: 5.2881, Val loss: 5.2779 +ptb - Train loss: 5.1944, Val loss: 5.2895 +lambada - Train loss: 4.5781, Val loss: 4.5824 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3400: +Train loss: 4.1536, Val loss: 4.1554 +wikitext-103-v1 - Train loss: 5.2667, Val loss: 5.2470 +ptb - Train loss: 5.1856, Val loss: 5.2822 +lambada - Train loss: 4.5812, Val loss: 4.5848 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3500: +Train loss: 4.1422, Val loss: 4.1424 +wikitext-103-v1 - Train loss: 5.2607, Val loss: 5.2540 +ptb - Train loss: 5.2094, Val loss: 5.3028 +lambada - Train loss: 4.5873, Val loss: 4.5914 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3600: +Train loss: 4.1360, Val loss: 4.1439 +wikitext-103-v1 - Train loss: 5.2692, Val loss: 5.2645 +ptb - Train loss: 5.1743, Val loss: 5.2813 +lambada - Train loss: 4.5695, Val loss: 4.5730 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3700: +Train loss: 4.1227, Val loss: 4.1239 +wikitext-103-v1 - Train loss: 5.2258, Val loss: 5.2145 +ptb - Train loss: 5.1660, Val loss: 5.2617 +lambada - Train loss: 4.5613, Val loss: 4.5655 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3800: +Train loss: 4.1182, Val loss: 4.1283 +wikitext-103-v1 - Train loss: 5.2341, Val loss: 5.2154 +ptb - Train loss: 5.1444, Val loss: 5.2508 +lambada - Train loss: 4.5727, Val loss: 4.5716 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 3900: +Train loss: 4.1218, Val loss: 4.1184 +wikitext-103-v1 - Train loss: 5.2341, Val loss: 5.2253 +ptb - Train loss: 5.1820, Val loss: 5.2793 +lambada - Train loss: 4.5684, Val loss: 4.5727 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4000: +Train loss: 4.1111, Val loss: 4.1120 +wikitext-103-v1 - Train loss: 5.2359, Val loss: 5.2231 +ptb - Train loss: 5.1421, Val loss: 5.2474 +lambada - Train loss: 4.5633, Val loss: 4.5646 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4100: +Train loss: 4.1032, Val loss: 4.1105 +wikitext-103-v1 - Train loss: 5.2161, Val loss: 5.1982 +ptb - Train loss: 5.1258, Val loss: 5.2248 +lambada - Train loss: 4.5645, Val loss: 4.5642 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4200: +Train loss: 4.0990, Val loss: 4.1054 +wikitext-103-v1 - Train loss: 5.2165, Val loss: 5.2088 +ptb - Train loss: 5.1368, Val loss: 5.2390 +lambada - Train loss: 4.5588, Val loss: 4.5571 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4300: +Train loss: 4.0971, Val loss: 4.0953 +wikitext-103-v1 - Train loss: 5.2058, Val loss: 5.2058 +ptb - Train loss: 5.1333, Val loss: 5.2331 +lambada - Train loss: 4.5418, Val loss: 4.5494 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4400: +Train loss: 4.0878, Val loss: 4.0926 +wikitext-103-v1 - Train loss: 5.2010, Val loss: 5.1881 +ptb - Train loss: 5.1472, Val loss: 5.2448 +lambada - Train loss: 4.5529, Val loss: 4.5534 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4500: +Train loss: 4.0801, Val loss: 4.0823 +wikitext-103-v1 - Train loss: 5.2099, Val loss: 5.1977 +ptb - Train loss: 5.1275, Val loss: 5.2261 +lambada - Train loss: 4.5502, Val loss: 4.5516 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4600: +Train loss: 4.0785, Val loss: 4.0809 +wikitext-103-v1 - Train loss: 5.1881, Val loss: 5.1849 +ptb - Train loss: 5.1276, Val loss: 5.2275 +lambada - Train loss: 4.5367, Val loss: 4.5449 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4700: +Train loss: 4.0759, Val loss: 4.0774 +wikitext-103-v1 - Train loss: 5.1825, Val loss: 5.1676 +ptb - Train loss: 5.1120, Val loss: 5.2090 +lambada - Train loss: 4.5403, Val loss: 4.5417 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4800: +Train loss: 4.0699, Val loss: 4.0704 +wikitext-103-v1 - Train loss: 5.1745, Val loss: 5.1694 +ptb - Train loss: 5.0925, Val loss: 5.1993 +lambada - Train loss: 4.5430, Val loss: 4.5460 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 4900: +Train loss: 4.0708, Val loss: 4.0723 +wikitext-103-v1 - Train loss: 5.1736, Val loss: 5.1638 +ptb - Train loss: 5.1097, Val loss: 5.2089 +lambada - Train loss: 4.5341, Val loss: 4.5324 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5000: +Train loss: 4.0738, Val loss: 4.0627 +wikitext-103-v1 - Train loss: 5.1783, Val loss: 5.1683 +ptb - Train loss: 5.1218, Val loss: 5.2211 +lambada - Train loss: 4.5202, Val loss: 4.5271 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5100: +Train loss: 4.0679, Val loss: 4.0629 +wikitext-103-v1 - Train loss: 5.1478, Val loss: 5.1345 +ptb - Train loss: 5.0906, Val loss: 5.1891 +lambada - Train loss: 4.5357, Val loss: 4.5417 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5200: +Train loss: 4.0619, Val loss: 4.0602 +wikitext-103-v1 - Train loss: 5.1570, Val loss: 5.1594 +ptb - Train loss: 5.1016, Val loss: 5.2008 +lambada - Train loss: 4.5226, Val loss: 4.5291 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5300: +Train loss: 4.0544, Val loss: 4.0475 +wikitext-103-v1 - Train loss: 5.1551, Val loss: 5.1481 +ptb - Train loss: 5.0927, Val loss: 5.1866 +lambada - Train loss: 4.5254, Val loss: 4.5286 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5400: +Train loss: 4.0552, Val loss: 4.0550 +wikitext-103-v1 - Train loss: 5.1494, Val loss: 5.1435 +ptb - Train loss: 5.0959, Val loss: 5.1937 +lambada - Train loss: 4.5124, Val loss: 4.5175 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5500: +Train loss: 4.0403, Val loss: 4.0419 +wikitext-103-v1 - Train loss: 5.1380, Val loss: 5.1331 +ptb - Train loss: 5.0658, Val loss: 5.1686 +lambada - Train loss: 4.5183, Val loss: 4.5230 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5600: +Train loss: 4.0374, Val loss: 4.0427 +wikitext-103-v1 - Train loss: 5.1417, Val loss: 5.1273 +ptb - Train loss: 5.0626, Val loss: 5.1609 +lambada - Train loss: 4.5095, Val loss: 4.5146 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5700: +Train loss: 4.0355, Val loss: 4.0437 +wikitext-103-v1 - Train loss: 5.1518, Val loss: 5.1364 +ptb - Train loss: 5.0862, Val loss: 5.1840 +lambada - Train loss: 4.5158, Val loss: 4.5195 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5800: +Train loss: 4.0385, Val loss: 4.0403 +wikitext-103-v1 - Train loss: 5.1305, Val loss: 5.1216 +ptb - Train loss: 5.0625, Val loss: 5.1620 +lambada - Train loss: 4.5112, Val loss: 4.5095 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 5900: +Train loss: 4.0364, Val loss: 4.0350 +wikitext-103-v1 - Train loss: 5.1208, Val loss: 5.1312 +ptb - Train loss: 5.0921, Val loss: 5.1881 +lambada - Train loss: 4.5059, Val loss: 4.5069 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6000: +Train loss: 4.0345, Val loss: 4.0319 +wikitext-103-v1 - Train loss: 5.1132, Val loss: 5.1048 +ptb - Train loss: 5.0719, Val loss: 5.1680 +lambada - Train loss: 4.5075, Val loss: 4.5090 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6100: +Train loss: 4.0291, Val loss: 4.0302 +wikitext-103-v1 - Train loss: 5.1343, Val loss: 5.1188 +ptb - Train loss: 5.0755, Val loss: 5.1754 +lambada - Train loss: 4.5063, Val loss: 4.5093 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6200: +Train loss: 4.0298, Val loss: 4.0270 +wikitext-103-v1 - Train loss: 5.1198, Val loss: 5.1135 +ptb - Train loss: 5.0849, Val loss: 5.1837 +lambada - Train loss: 4.5001, Val loss: 4.5057 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6300: +Train loss: 4.0120, Val loss: 4.0223 +wikitext-103-v1 - Train loss: 5.1073, Val loss: 5.0935 +ptb - Train loss: 5.0636, Val loss: 5.1639 +lambada - Train loss: 4.5023, Val loss: 4.5066 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6400: +Train loss: 4.0303, Val loss: 4.0206 +wikitext-103-v1 - Train loss: 5.1083, Val loss: 5.0936 +ptb - Train loss: 5.0704, Val loss: 5.1765 +lambada - Train loss: 4.4978, Val loss: 4.5035 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6500: +Train loss: 4.0186, Val loss: 4.0194 +wikitext-103-v1 - Train loss: 5.1182, Val loss: 5.1039 +ptb - Train loss: 5.0613, Val loss: 5.1594 +lambada - Train loss: 4.4953, Val loss: 4.5011 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6600: +Train loss: 4.0106, Val loss: 4.0200 +wikitext-103-v1 - Train loss: 5.1055, Val loss: 5.1000 +ptb - Train loss: 5.0517, Val loss: 5.1510 +lambada - Train loss: 4.4959, Val loss: 4.4997 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6700: +Train loss: 4.0081, Val loss: 4.0183 +wikitext-103-v1 - Train loss: 5.0990, Val loss: 5.0874 +ptb - Train loss: 5.0493, Val loss: 5.1487 +lambada - Train loss: 4.4933, Val loss: 4.4947 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6800: +Train loss: 4.0132, Val loss: 4.0074 +wikitext-103-v1 - Train loss: 5.1058, Val loss: 5.0888 +ptb - Train loss: 5.0600, Val loss: 5.1576 +lambada - Train loss: 4.4992, Val loss: 4.5044 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 6900: +Train loss: 4.0119, Val loss: 4.0043 +wikitext-103-v1 - Train loss: 5.0862, Val loss: 5.0822 +ptb - Train loss: 5.0430, Val loss: 5.1389 +lambada - Train loss: 4.4929, Val loss: 4.4981 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7000: +Train loss: 4.0135, Val loss: 4.0095 +wikitext-103-v1 - Train loss: 5.0859, Val loss: 5.0767 +ptb - Train loss: 5.0395, Val loss: 5.1432 +lambada - Train loss: 4.5001, Val loss: 4.4999 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7100: +Train loss: 4.0043, Val loss: 4.0185 +wikitext-103-v1 - Train loss: 5.0892, Val loss: 5.0722 +ptb - Train loss: 5.0485, Val loss: 5.1471 +lambada - Train loss: 4.4882, Val loss: 4.4902 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7200: +Train loss: 4.0052, Val loss: 4.0054 +wikitext-103-v1 - Train loss: 5.0781, Val loss: 5.0803 +ptb - Train loss: 5.0392, Val loss: 5.1441 +lambada - Train loss: 4.4882, Val loss: 4.4935 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7300: +Train loss: 4.0009, Val loss: 4.0039 +wikitext-103-v1 - Train loss: 5.0892, Val loss: 5.0818 +ptb - Train loss: 5.0499, Val loss: 5.1508 +lambada - Train loss: 4.4868, Val loss: 4.4912 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7400: +Train loss: 3.9974, Val loss: 3.9971 +wikitext-103-v1 - Train loss: 5.0756, Val loss: 5.0686 +ptb - Train loss: 5.0278, Val loss: 5.1275 +lambada - Train loss: 4.4925, Val loss: 4.4914 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7500: +Train loss: 4.0018, Val loss: 3.9977 +wikitext-103-v1 - Train loss: 5.0840, Val loss: 5.0721 +ptb - Train loss: 5.0323, Val loss: 5.1335 +lambada - Train loss: 4.4840, Val loss: 4.4909 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7600: +Train loss: 4.0024, Val loss: 4.0007 +wikitext-103-v1 - Train loss: 5.0780, Val loss: 5.0694 +ptb - Train loss: 5.0279, Val loss: 5.1279 +lambada - Train loss: 4.4839, Val loss: 4.4884 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7700: +Train loss: 3.9975, Val loss: 3.9980 +wikitext-103-v1 - Train loss: 5.0839, Val loss: 5.0680 +ptb - Train loss: 5.0371, Val loss: 5.1319 +lambada - Train loss: 4.4786, Val loss: 4.4843 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7800: +Train loss: 3.9940, Val loss: 3.9906 +wikitext-103-v1 - Train loss: 5.0748, Val loss: 5.0596 +ptb - Train loss: 5.0201, Val loss: 5.1231 +lambada - Train loss: 4.4859, Val loss: 4.4937 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 7900: +Train loss: 3.9901, Val loss: 3.9896 +wikitext-103-v1 - Train loss: 5.0743, Val loss: 5.0577 +ptb - Train loss: 5.0312, Val loss: 5.1255 +lambada - Train loss: 4.4822, Val loss: 4.4850 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8000: +Train loss: 3.9915, Val loss: 3.9886 +wikitext-103-v1 - Train loss: 5.0631, Val loss: 5.0614 +ptb - Train loss: 5.0387, Val loss: 5.1314 +lambada - Train loss: 4.4788, Val loss: 4.4850 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8100: +Train loss: 3.9910, Val loss: 3.9935 +wikitext-103-v1 - Train loss: 5.0712, Val loss: 5.0660 +ptb - Train loss: 5.0312, Val loss: 5.1405 +lambada - Train loss: 4.4793, Val loss: 4.4844 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8200: +Train loss: 3.9864, Val loss: 3.9880 +wikitext-103-v1 - Train loss: 5.0709, Val loss: 5.0580 +ptb - Train loss: 5.0259, Val loss: 5.1218 +lambada - Train loss: 4.4796, Val loss: 4.4821 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8300: +Train loss: 3.9835, Val loss: 3.9936 +wikitext-103-v1 - Train loss: 5.0681, Val loss: 5.0561 +ptb - Train loss: 5.0273, Val loss: 5.1259 +lambada - Train loss: 4.4764, Val loss: 4.4822 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8400: +Train loss: 3.9881, Val loss: 3.9897 +wikitext-103-v1 - Train loss: 5.0534, Val loss: 5.0477 +ptb - Train loss: 5.0190, Val loss: 5.1186 +lambada - Train loss: 4.4750, Val loss: 4.4784 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8500: +Train loss: 3.9777, Val loss: 3.9918 +wikitext-103-v1 - Train loss: 5.0605, Val loss: 5.0531 +ptb - Train loss: 5.0218, Val loss: 5.1322 +lambada - Train loss: 4.4782, Val loss: 4.4836 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8600: +Train loss: 3.9806, Val loss: 3.9887 +wikitext-103-v1 - Train loss: 5.0600, Val loss: 5.0480 +ptb - Train loss: 5.0226, Val loss: 5.1188 +lambada - Train loss: 4.4799, Val loss: 4.4809 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8700: +Train loss: 3.9781, Val loss: 3.9805 +wikitext-103-v1 - Train loss: 5.0556, Val loss: 5.0466 +ptb - Train loss: 5.0239, Val loss: 5.1292 +lambada - Train loss: 4.4781, Val loss: 4.4834 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8800: +Train loss: 3.9785, Val loss: 3.9802 +wikitext-103-v1 - Train loss: 5.0655, Val loss: 5.0525 +ptb - Train loss: 5.0268, Val loss: 5.1209 +lambada - Train loss: 4.4741, Val loss: 4.4766 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 8900: +Train loss: 3.9833, Val loss: 3.9931 +wikitext-103-v1 - Train loss: 5.0574, Val loss: 5.0533 +ptb - Train loss: 5.0179, Val loss: 5.1188 +lambada - Train loss: 4.4760, Val loss: 4.4813 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9000: +Train loss: 3.9835, Val loss: 3.9841 +wikitext-103-v1 - Train loss: 5.0689, Val loss: 5.0436 +ptb - Train loss: 5.0171, Val loss: 5.1169 +lambada - Train loss: 4.4742, Val loss: 4.4769 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9100: +Train loss: 3.9748, Val loss: 3.9785 +wikitext-103-v1 - Train loss: 5.0556, Val loss: 5.0489 +ptb - Train loss: 5.0122, Val loss: 5.1160 +lambada - Train loss: 4.4743, Val loss: 4.4800 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9200: +Train loss: 3.9654, Val loss: 3.9810 +wikitext-103-v1 - Train loss: 5.0495, Val loss: 5.0474 +ptb - Train loss: 5.0168, Val loss: 5.1189 +lambada - Train loss: 4.4711, Val loss: 4.4751 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9300: +Train loss: 3.9789, Val loss: 3.9732 +wikitext-103-v1 - Train loss: 5.0582, Val loss: 5.0508 +ptb - Train loss: 5.0140, Val loss: 5.1162 +lambada - Train loss: 4.4699, Val loss: 4.4748 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9400: +Train loss: 3.9758, Val loss: 3.9833 +wikitext-103-v1 - Train loss: 5.0500, Val loss: 5.0506 +ptb - Train loss: 5.0191, Val loss: 5.1176 +lambada - Train loss: 4.4708, Val loss: 4.4721 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9500: +Train loss: 3.9783, Val loss: 3.9687 +wikitext-103-v1 - Train loss: 5.0574, Val loss: 5.0463 +ptb - Train loss: 5.0167, Val loss: 5.1181 +lambada - Train loss: 4.4721, Val loss: 4.4790 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9600: +Train loss: 3.9716, Val loss: 3.9782 +wikitext-103-v1 - Train loss: 5.0629, Val loss: 5.0453 +ptb - Train loss: 5.0238, Val loss: 5.1252 +lambada - Train loss: 4.4728, Val loss: 4.4718 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9700: +Train loss: 3.9712, Val loss: 3.9751 +wikitext-103-v1 - Train loss: 5.0511, Val loss: 5.0384 +ptb - Train loss: 5.0075, Val loss: 5.1128 +lambada - Train loss: 4.4700, Val loss: 4.4715 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9800: +Train loss: 3.9700, Val loss: 3.9722 +wikitext-103-v1 - Train loss: 5.0589, Val loss: 5.0406 +ptb - Train loss: 5.0216, Val loss: 5.1208 +lambada - Train loss: 4.4703, Val loss: 4.4750 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 9900: +Train loss: 3.9801, Val loss: 3.9726 +wikitext-103-v1 - Train loss: 5.0536, Val loss: 5.0397 +ptb - Train loss: 5.0144, Val loss: 5.1186 +lambada - Train loss: 4.4761, Val loss: 4.4739 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt + +Step 10000: +Train loss: 3.9734, Val loss: 3.9688 +wikitext-103-v1 - Train loss: 5.0462, Val loss: 5.0392 +ptb - Train loss: 5.0146, Val loss: 5.1124 +lambada - Train loss: 4.4680, Val loss: 4.4717 +Saving checkpoint to out/ckpt_polynomial_legendre_default.pt diff --git a/wandb/run-20240927_005424-60260ulk/logs/debug-core.log b/wandb/run-20240927_005424-60260ulk/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..8e1d5fed8d5c048d10f3ce663c8cec560a7e3445 --- /dev/null +++ b/wandb/run-20240927_005424-60260ulk/logs/debug-core.log @@ -0,0 +1,17 @@ +{"time":"2024-09-26T18:08:13.661085645Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp586bghqd/port-6168.txt","pid":6168,"debug":false,"disable-analytics":false} +{"time":"2024-09-26T18:08:13.66112903Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2024-09-26T18:08:13.661979363Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":6168} +{"time":"2024-09-26T18:08:13.661969217Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43071,"Zone":""}} +{"time":"2024-09-26T18:08:13.857998352Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:50862"} +{"time":"2024-09-26T18:08:14.29644191Z","level":"INFO","msg":"connection init received","streamId":"1klxtkie","id":"127.0.0.1:50862"} +{"time":"2024-09-26T18:08:14.296870522Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_180813.log /root/wandb/run-20240926_180814-1klxtkie/logs/debug-core.log: file exists"} +{"time":"2024-09-26T18:08:14.300104442Z","level":"INFO","msg":"connection init completed","streamId":"1klxtkie","id":"127.0.0.1:50862"} +{"time":"2024-09-27T00:54:24.403773763Z","level":"INFO","msg":"handle finish received","streamId":"1klxtkie","id":"127.0.0.1:50862"} +{"time":"2024-09-27T00:54:24.933689321Z","level":"INFO","msg":"connection init received","streamId":"60260ulk","id":"127.0.0.1:50862"} +{"time":"2024-09-27T00:54:24.934341547Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_180813.log /root/wandb/run-20240927_005424-60260ulk/logs/debug-core.log: file exists"} +{"time":"2024-09-27T00:54:24.937048792Z","level":"INFO","msg":"connection init completed","streamId":"60260ulk","id":"127.0.0.1:50862"} +{"time":"2024-09-27T07:40:10.740842849Z","level":"INFO","msg":"handle finish received","streamId":"60260ulk","id":"127.0.0.1:50862"} +{"time":"2024-09-27T07:40:11.365393025Z","level":"INFO","msg":"connection init received","streamId":"gzu8f7wl","id":"127.0.0.1:50862"} +{"time":"2024-09-27T07:40:11.365761867Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_180813.log /root/wandb/run-20240927_074011-gzu8f7wl/logs/debug-core.log: file exists"} +{"time":"2024-09-27T07:40:11.368624142Z","level":"INFO","msg":"connection init completed","streamId":"gzu8f7wl","id":"127.0.0.1:50862"} +{"time":"2024-09-27T14:25:58.663233862Z","level":"INFO","msg":"Parent process exited, terminating service process."} diff --git a/wandb/run-20240927_005424-60260ulk/logs/debug.log b/wandb/run-20240927_005424-60260ulk/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..63a5b32a33f095ad80a32cf4de350fff5730e74e --- /dev/null +++ b/wandb/run-20240927_005424-60260ulk/logs/debug.log @@ -0,0 +1,32 @@ +2024-09-27 00:54:24,931 INFO MainThread:6168 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1 +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Configure stats pid to 6168 +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Loading settings from /root/wandb/settings +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Loading settings from environment variables: {} +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None} +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/train.py', 'program': '/root/train.py'} +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_setup.py:_flush():77] Applying login settings: {} +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_init.py:_log_setup():532] Logging user logs to /root/wandb/run-20240927_005424-60260ulk/logs/debug.log +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_init.py:_log_setup():533] Logging internal logs to /root/wandb/run-20240927_005424-60260ulk/logs/debug-internal.log +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_init.py:init():616] calling init triggers +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_init.py:init():623] wandb.init called with sweep_config: {} +config: {'out_dir': 'out', 'eval_interval': 100, 'log_interval': 1, 'eval_iters': 100, 'eval_only': False, 'always_save_checkpoint': True, 'init_from': 'scratch', 'checkpoint_path': '', 'wandb_log': True, 'wandb_project': 'gpt2_positional_encodings_10B', 'wandb_run_name': 'experiment', 'dataset': 'fineweb', 'gradient_accumulation_steps': 40, 'batch_size': 120, 'block_size': 512, 'n_layer': 4, 'n_head': 4, 'n_embd': 256, 'dropout': 0.0, 'bias': False, 'learning_rate': 0.0006, 'max_iters': 10000, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'decay_lr': True, 'warmup_iters': 100, 'lr_decay_iters': 10000, 'min_lr': 6e-05, 'backend': 'nccl', 'device': 'cuda', 'dtype': 'bfloat16', 'compile': True, 'embedding_types': ['sinusoidal', 'polynomial_legendre', 'polynomial_chebyshev', 'random_fourier', 'wavelet'], 'attention_types': ['default'], 'collect_attention_patterns': False, 'collect_activations': False, 'eval_datasets': ['wikitext-103-v1', 'ptb', 'lambada'], 'seed': 1337} +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_init.py:init():666] starting backend +2024-09-27 00:54:24,932 INFO MainThread:6168 [wandb_init.py:init():670] setting up manager +2024-09-27 00:54:24,933 INFO MainThread:6168 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-09-27 00:54:24,933 INFO MainThread:6168 [wandb_init.py:init():678] backend started and connected +2024-09-27 00:54:24,937 INFO MainThread:6168 [wandb_init.py:init():773] updated telemetry +2024-09-27 00:54:24,937 INFO MainThread:6168 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout +2024-09-27 00:54:26,600 INFO MainThread:6168 [wandb_init.py:init():857] starting run threads in backend +2024-09-27 00:54:26,731 INFO MainThread:6168 [wandb_run.py:_console_start():2459] atexit reg +2024-09-27 00:54:26,731 INFO MainThread:6168 [wandb_run.py:_redirect():2307] redirect: wrap_raw +2024-09-27 00:54:26,731 INFO MainThread:6168 [wandb_run.py:_redirect():2372] Wrapping output streams. +2024-09-27 00:54:26,731 INFO MainThread:6168 [wandb_run.py:_redirect():2397] Redirects installed. +2024-09-27 00:54:26,732 INFO MainThread:6168 [wandb_init.py:init():900] run started, returning control to user process +2024-09-27 07:40:08,167 INFO MainThread:6168 [wandb_run.py:_finish():2158] finishing run tulasiram/gpt2_positional_encodings_10B/60260ulk +2024-09-27 07:40:08,167 INFO MainThread:6168 [wandb_run.py:_atexit_cleanup():2422] got exitcode: 0 +2024-09-27 07:40:08,168 INFO MainThread:6168 [wandb_run.py:_restore():2404] restore +2024-09-27 07:40:08,168 INFO MainThread:6168 [wandb_run.py:_restore():2410] restore done +2024-09-27 07:40:10,726 INFO MainThread:6168 [wandb_run.py:_footer_history_summary_info():4037] rendering history +2024-09-27 07:40:10,729 INFO MainThread:6168 [wandb_run.py:_footer_history_summary_info():4069] rendering summary +2024-09-27 07:40:10,740 INFO MainThread:6168 [wandb_run.py:_footer_sync_info():3996] logging synced files diff --git a/wandb/run-20240927_074011-gzu8f7wl/logs/debug-internal.log b/wandb/run-20240927_074011-gzu8f7wl/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..149b3f1429244311245730c5ccc9c3ab3cc4c4c4 --- /dev/null +++ b/wandb/run-20240927_074011-gzu8f7wl/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-09-27T07:40:11.365685032Z","level":"INFO","msg":"using version","core version":"0.18.1"} +{"time":"2024-09-27T07:40:11.36572336Z","level":"INFO","msg":"created symlink","path":"/root/wandb/run-20240927_074011-gzu8f7wl/logs/debug-core.log"} +{"time":"2024-09-27T07:40:11.365800956Z","level":"INFO","msg":"using version","core version":"0.18.1"} +{"time":"2024-09-27T07:40:11.365811922Z","level":"INFO","msg":"created symlink","path":"/root/wandb/run-20240927_074011-gzu8f7wl/logs/debug-core.log"} +{"time":"2024-09-27T07:40:11.368570572Z","level":"INFO","msg":"created new stream","id":"gzu8f7wl"} +{"time":"2024-09-27T07:40:11.368612534Z","level":"INFO","msg":"stream: started","id":"gzu8f7wl"} +{"time":"2024-09-27T07:40:11.368650852Z","level":"INFO","msg":"handler: started","stream_id":{"value":"gzu8f7wl"}} +{"time":"2024-09-27T07:40:11.368664782Z","level":"INFO","msg":"sender: started","stream_id":{"value":"gzu8f7wl"}} +{"time":"2024-09-27T07:40:11.368691442Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"gzu8f7wl"}} +{"time":"2024-09-27T07:40:11.893705861Z","level":"INFO","msg":"wandb-core","!BADKEY":null} +{"time":"2024-09-27T07:40:11.894243206Z","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-09-27T14:25:56.005687445Z","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-09-27T14:25:56.006006112Z","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-09-27T14:25:56.457453089Z","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2024-09-27T14:25:56.457480099Z","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2024-09-27T14:25:56.457490264Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}