// ffmpeg-cuda / libavfilter / vf_bwdif_cuda.ptx
// Uploaded by camenduru -- thanks to ffmpeg <3 (source commit 8ead80b)
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31833905
// Cuda compilation tools, release 11.8, V11.8.89
// Based on NVVM 7.0.1
//
.version 7.8
.target sm_60
.address_size 64
// .globl bwdif_uchar
// BWDIF filter coefficients, stored as little-endian 32-bit words:
//   coef_lf = { 4309, 213 }       (low-frequency pair)
//   coef_hf = { 5570, 3801, 1016 } (high-frequency triple)
//   coef_sp = { 5077, 981 }       (spatial pair)
// The kernels below use these same values as inline immediates
// (5077/-981, 5570/-3801/1016, 4309/-213); no instruction in this
// file references these globals directly -- the compiler folded them.
.global .align 4 .b8 coef_lf[8] = {213, 16, 0, 0, 213, 0, 0, 0};
.global .align 4 .b8 coef_hf[12] = {194, 21, 0, 0, 217, 14, 0, 0, 248, 3, 0, 0};
.global .align 4 .b8 coef_sp[8] = {213, 19, 0, 0, 213, 3, 0, 0};
//----------------------------------------------------------------------
// bwdif_uchar -- BWDIF deinterlacing of one 8-bit plane; one thread per
// output pixel (x = ctaid.x*ntid.x+tid.x, y = ctaid.y*ntid.y+tid.y).
// Parameter roles, as established by the loads/uses below (params 7 and
// 8 are declared but never read):
//   param_0  dst pointer (global memory, 1 byte per pixel)
//   param_1..param_3  texture objects; param_2 is copied through
//            unchanged on rows whose parity equals param_9, so it is
//            presumably the current field -- confirm against the caller
//   param_4  x bound (thread exits when x >= param_4)
//   param_5  y bound (thread exits when y >= param_5)
//   param_6  dst row pitch in pixels (offset = y*param_6 + x)
//   param_9  field parity, compared against y % 2
//   param_10 compared with param_9 to choose which neighbouring-field
//            texture feeds each spatial difference score
//   param_11 nonzero => spatial-only filter path (coef_sp)
//   param_12 upper clamp for the stored sample
//----------------------------------------------------------------------
.visible .entry bwdif_uchar(
.param .u64 bwdif_uchar_param_0,
.param .u64 bwdif_uchar_param_1,
.param .u64 bwdif_uchar_param_2,
.param .u64 bwdif_uchar_param_3,
.param .u32 bwdif_uchar_param_4,
.param .u32 bwdif_uchar_param_5,
.param .u32 bwdif_uchar_param_6,
.param .u32 bwdif_uchar_param_7,
.param .u32 bwdif_uchar_param_8,
.param .u32 bwdif_uchar_param_9,
.param .u32 bwdif_uchar_param_10,
.param .u32 bwdif_uchar_param_11,
.param .u32 bwdif_uchar_param_12
)
{
.reg .pred %p<12>;
.reg .f32 %f<12>;
.reg .b32 %r<191>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [bwdif_uchar_param_0];
ld.param.u64 %rd3, [bwdif_uchar_param_1];
ld.param.u64 %rd4, [bwdif_uchar_param_2];
ld.param.u64 %rd5, [bwdif_uchar_param_3];
ld.param.u32 %r54, [bwdif_uchar_param_4];
ld.param.u32 %r55, [bwdif_uchar_param_5];
ld.param.u32 %r49, [bwdif_uchar_param_6];
ld.param.u32 %r50, [bwdif_uchar_param_9];
ld.param.u32 %r51, [bwdif_uchar_param_10];
ld.param.u32 %r52, [bwdif_uchar_param_11];
ld.param.u32 %r53, [bwdif_uchar_param_12];
// x = blockIdx.x*blockDim.x + threadIdx.x; y likewise in the y dim.
mov.u32 %r56, %ctaid.x;
mov.u32 %r57, %ntid.x;
mov.u32 %r58, %tid.x;
mad.lo.s32 %r1, %r56, %r57, %r58;
mov.u32 %r59, %ntid.y;
mov.u32 %r60, %ctaid.y;
mov.u32 %r61, %tid.y;
mad.lo.s32 %r2, %r60, %r59, %r61;
// Bounds check: exit if x >= param_4 or y >= param_5.
setp.ge.s32 %p1, %r1, %r54;
setp.ge.s32 %p2, %r2, %r55;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB0_11;
cvta.to.global.u64 %rd6, %rd2;
// r65 = y % 2 (signed remainder); compare against param_9 (parity).
shr.u32 %r62, %r2, 31;
add.s32 %r63, %r2, %r62;
and.b32 %r64, %r63, -2;
sub.s32 %r65, %r2, %r64;
setp.eq.s32 %p4, %r65, %r50;
cvt.rn.f32.s32 %f1, %r1;
// rd1 = dst + y*pitch + x (1 byte per pixel).
mad.lo.s32 %r66, %r2, %r49, %r1;
cvt.s64.s32 %rd7, %r66;
add.s64 %rd1, %rd6, %rd7;
@%p4 bra $L__BB0_10;
bra.uni $L__BB0_2;
$L__BB0_10:
// Row parity matches param_9: copy the param_2 sample through unchanged.
cvt.rn.f32.s32 %f11, %r2;
tex.2d.v4.u32.f32 {%r185, %r186, %r187, %r188}, [%rd4, {%f1, %f11}];
st.global.u8 [%rd1], %r185;
bra.uni $L__BB0_11;
$L__BB0_2:
// Opposite-parity row: sample rows y+3, y+1, y-1, y-3 from param_2.
add.s32 %r67, %r2, 3;
cvt.rn.f32.s32 %f4, %r67;
tex.2d.v4.u32.f32 {%r3, %r68, %r69, %r70}, [%rd4, {%f1, %f4}];
add.s32 %r71, %r2, 1;
cvt.rn.f32.s32 %f2, %r71;
tex.2d.v4.u32.f32 {%r72, %r73, %r74, %r75}, [%rd4, {%f1, %f2}];
add.s32 %r76, %r2, -1;
cvt.rn.f32.s32 %f3, %r76;
tex.2d.v4.u32.f32 {%r77, %r78, %r79, %r80}, [%rd4, {%f1, %f3}];
add.s32 %r81, %r2, -3;
cvt.rn.f32.s32 %f5, %r81;
tex.2d.v4.u32.f32 {%r4, %r82, %r83, %r84}, [%rd4, {%f1, %f5}];
and.b32 %r5, %r77, 255;
and.b32 %r6, %r72, 255;
setp.eq.s32 %p5, %r52, 0;
@%p5 bra $L__BB0_4;
// param_11 != 0: spatial-only filter
// (5077*(c[y-1]+c[y+1]) - 981*(c[y-3]+c[y+3])) >> 13, clamped to [0, param_12].
add.s32 %r85, %r5, %r6;
mul.lo.s32 %r86, %r85, 5077;
and.b32 %r87, %r3, 255;
and.b32 %r88, %r4, 255;
add.s32 %r89, %r88, %r87;
mad.lo.s32 %r90, %r89, -981, %r86;
shr.s32 %r91, %r90, 13;
setp.lt.s32 %p6, %r90, 0;
min.s32 %r92, %r91, %r53;
selp.b32 %r93, 0, %r92, %p6;
st.global.u8 [%rd1], %r93;
bra.uni $L__BB0_11;
$L__BB0_4:
// Temporal path: pick the two adjacent-field textures by
// (param_9 == param_10), then sample rows y+-4, y+-2, y from param_1
// and param_3, and rows y+-1 from the selected pair.
setp.eq.s32 %p7, %r50, %r51;
selp.b64 %rd8, %rd4, %rd3, %p7;
selp.b64 %rd9, %rd5, %rd4, %p7;
add.s32 %r94, %r2, 4;
cvt.rn.f32.s32 %f6, %r94;
tex.2d.v4.u32.f32 {%r7, %r8, %r9, %r10}, [%rd3, {%f1, %f6}];
add.s32 %r95, %r2, 2;
cvt.rn.f32.s32 %f7, %r95;
tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f7}];
cvt.rn.f32.s32 %f8, %r2;
tex.2d.v4.u32.f32 {%r96, %r97, %r98, %r99}, [%rd3, {%f1, %f8}];
add.s32 %r100, %r2, -2;
cvt.rn.f32.s32 %f9, %r100;
tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f9}];
add.s32 %r101, %r2, -4;
cvt.rn.f32.s32 %f10, %r101;
tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f10}];
tex.2d.v4.u32.f32 {%r102, %r103, %r104, %r105}, [%rd8, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r106, %r107, %r108, %r109}, [%rd8, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r110, %r111, %r112, %r113}, [%rd9, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r114, %r115, %r116, %r117}, [%rd9, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd5, {%f1, %f7}];
tex.2d.v4.u32.f32 {%r118, %r119, %r120, %r121}, [%rd5, {%f1, %f8}];
tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd5, {%f1, %f9}];
tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd5, {%f1, %f10}];
// r190 = average of the param_1 and param_3 samples at row y; r37 = their
// absolute difference.
and.b32 %r122, %r96, 255;
and.b32 %r123, %r118, 255;
add.s32 %r35, %r123, %r122;
shr.u32 %r190, %r35, 1;
sub.s32 %r124, %r122, %r123;
abs.s32 %r37, %r124;
and.b32 %r125, %r106, 255;
sub.s32 %r126, %r125, %r5;
abs.s32 %r127, %r126;
and.b32 %r128, %r102, 255;
sub.s32 %r129, %r128, %r6;
abs.s32 %r130, %r129;
add.s32 %r131, %r127, %r130;
shr.u32 %r132, %r131, 1;
and.b32 %r133, %r114, 255;
sub.s32 %r134, %r133, %r5;
abs.s32 %r135, %r134;
and.b32 %r136, %r110, 255;
sub.s32 %r137, %r136, %r6;
abs.s32 %r138, %r137;
add.s32 %r139, %r135, %r138;
shr.u32 %r140, %r139, 1;
shr.u32 %r141, %r37, 1;
// r38 = max(|diff|/2, neighbour-row score vs rd8, neighbour-row score vs
// rd9); zero means no detected change -> keep the average in r190.
max.u32 %r142, %r141, %r132;
max.s32 %r38, %r142, %r140;
setp.eq.s32 %p8, %r38, 0;
@%p8 bra $L__BB0_9;
and.b32 %r143, %r15, 255;
and.b32 %r144, %r27, 255;
add.s32 %r39, %r144, %r143;
shr.u32 %r145, %r39, 1;
sub.s32 %r146, %r145, %r5;
and.b32 %r147, %r11, 255;
and.b32 %r148, %r23, 255;
add.s32 %r40, %r148, %r147;
shr.u32 %r149, %r40, 1;
sub.s32 %r150, %r149, %r6;
min.s32 %r151, %r146, %r150;
sub.s32 %r152, %r190, %r6;
sub.s32 %r153, %r190, %r5;
max.s32 %r154, %r152, %r153;
max.s32 %r155, %r154, %r151;
max.s32 %r156, %r146, %r150;
min.s32 %r157, %r152, %r153;
min.s32 %r158, %r157, %r156;
neg.s32 %r159, %r155;
// r41 = adjusted allowed deviation around the field average.
max.s32 %r160, %r38, %r158;
max.s32 %r41, %r160, %r159;
sub.s32 %r161, %r5, %r6;
abs.s32 %r162, %r161;
// If |c[y-1]-c[y+1]| > |field diff| use the high-frequency branch (BB0_7),
// otherwise the spatial-coefficient branch (BB0_6).
setp.gt.s32 %p9, %r162, %r37;
add.s32 %r42, %r5, %r6;
and.b32 %r163, %r3, 255;
and.b32 %r164, %r4, 255;
add.s32 %r43, %r164, %r163;
@%p9 bra $L__BB0_7;
bra.uni $L__BB0_6;
$L__BB0_7:
// High-frequency filter: 5570/-3801/1016 (coef_hf) plus 4309/-213 (coef_lf).
// NOTE(review): the param_3 texture is never sampled at row y+4 in this
// kernel, and its y-4 sample (%r31) is added twice below (%r173, %r174) --
// faithful to the generated code; verify against the CUDA source.
mul.lo.s32 %r166, %r35, 5570;
add.s32 %r167, %r39, %r40;
mad.lo.s32 %r168, %r167, -3801, %r166;
and.b32 %r169, %r7, 255;
and.b32 %r170, %r19, 255;
add.s32 %r171, %r170, %r169;
and.b32 %r172, %r31, 255;
add.s32 %r173, %r171, %r172;
add.s32 %r174, %r173, %r172;
mad.lo.s32 %r175, %r174, 1016, %r168;
shr.s32 %r176, %r175, 2;
mul.lo.s32 %r177, %r42, 4309;
mad.lo.s32 %r178, %r43, -213, %r177;
add.s32 %r189, %r178, %r176;
bra.uni $L__BB0_8;
$L__BB0_6:
// Spatial coefficients: 5077*(c[y-1]+c[y+1]) - 981*(c[y-3]+c[y+3]).
mul.lo.s32 %r165, %r42, 5077;
mad.lo.s32 %r189, %r43, -981, %r165;
$L__BB0_8:
// Clamp the (>>13) filtered value to [avg - r41, avg + r41], then to
// [0, param_12]; result replaces r190.
add.s32 %r179, %r41, %r190;
shr.s32 %r180, %r189, 13;
setp.gt.s32 %p10, %r180, %r179;
sub.s32 %r181, %r190, %r41;
max.s32 %r182, %r180, %r181;
selp.b32 %r183, %r179, %r182, %p10;
setp.lt.s32 %p11, %r183, 0;
min.s32 %r184, %r183, %r53;
selp.b32 %r190, 0, %r184, %p11;
$L__BB0_9:
// Store the interpolated 8-bit sample.
st.global.u8 [%rd1], %r190;
$L__BB0_11:
ret;
}
// .globl bwdif_ushort
//----------------------------------------------------------------------
// bwdif_ushort -- 16-bit variant of bwdif_uchar: identical control flow
// and coefficients, but samples are masked with 65535 and dst is
// addressed/stored as u16 (offset = (y*param_6 + x) * 2 bytes).
// Parameter layout matches bwdif_uchar (params 7 and 8 never read);
// param_12 is the upper clamp for the stored sample.
//----------------------------------------------------------------------
.visible .entry bwdif_ushort(
.param .u64 bwdif_ushort_param_0,
.param .u64 bwdif_ushort_param_1,
.param .u64 bwdif_ushort_param_2,
.param .u64 bwdif_ushort_param_3,
.param .u32 bwdif_ushort_param_4,
.param .u32 bwdif_ushort_param_5,
.param .u32 bwdif_ushort_param_6,
.param .u32 bwdif_ushort_param_7,
.param .u32 bwdif_ushort_param_8,
.param .u32 bwdif_ushort_param_9,
.param .u32 bwdif_ushort_param_10,
.param .u32 bwdif_ushort_param_11,
.param .u32 bwdif_ushort_param_12
)
{
.reg .pred %p<12>;
.reg .f32 %f<12>;
.reg .b32 %r<191>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [bwdif_ushort_param_0];
ld.param.u64 %rd3, [bwdif_ushort_param_1];
ld.param.u64 %rd4, [bwdif_ushort_param_2];
ld.param.u64 %rd5, [bwdif_ushort_param_3];
ld.param.u32 %r54, [bwdif_ushort_param_4];
ld.param.u32 %r55, [bwdif_ushort_param_5];
ld.param.u32 %r49, [bwdif_ushort_param_6];
ld.param.u32 %r50, [bwdif_ushort_param_9];
ld.param.u32 %r51, [bwdif_ushort_param_10];
ld.param.u32 %r52, [bwdif_ushort_param_11];
ld.param.u32 %r53, [bwdif_ushort_param_12];
// x/y output coordinates from grid/block indices.
mov.u32 %r56, %ctaid.x;
mov.u32 %r57, %ntid.x;
mov.u32 %r58, %tid.x;
mad.lo.s32 %r1, %r56, %r57, %r58;
mov.u32 %r59, %ntid.y;
mov.u32 %r60, %ctaid.y;
mov.u32 %r61, %tid.y;
mad.lo.s32 %r2, %r60, %r59, %r61;
// Bounds check against param_4 / param_5.
setp.ge.s32 %p1, %r1, %r54;
setp.ge.s32 %p2, %r2, %r55;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB1_11;
cvta.to.global.u64 %rd6, %rd2;
// r65 = y % 2, compared with param_9.
shr.u32 %r62, %r2, 31;
add.s32 %r63, %r2, %r62;
and.b32 %r64, %r63, -2;
sub.s32 %r65, %r2, %r64;
setp.eq.s32 %p4, %r65, %r50;
cvt.rn.f32.s32 %f1, %r1;
// rd1 = dst + (y*pitch + x)*2 (u16 samples).
mad.lo.s32 %r66, %r2, %r49, %r1;
mul.wide.s32 %rd7, %r66, 2;
add.s64 %rd1, %rd6, %rd7;
@%p4 bra $L__BB1_10;
bra.uni $L__BB1_2;
$L__BB1_10:
// Same-parity row: copy the param_2 sample through unchanged.
cvt.rn.f32.s32 %f11, %r2;
tex.2d.v4.u32.f32 {%r185, %r186, %r187, %r188}, [%rd4, {%f1, %f11}];
st.global.u16 [%rd1], %r185;
bra.uni $L__BB1_11;
$L__BB1_2:
// Sample rows y+3, y+1, y-1, y-3 from param_2.
add.s32 %r67, %r2, 3;
cvt.rn.f32.s32 %f4, %r67;
tex.2d.v4.u32.f32 {%r3, %r68, %r69, %r70}, [%rd4, {%f1, %f4}];
add.s32 %r71, %r2, 1;
cvt.rn.f32.s32 %f2, %r71;
tex.2d.v4.u32.f32 {%r72, %r73, %r74, %r75}, [%rd4, {%f1, %f2}];
add.s32 %r76, %r2, -1;
cvt.rn.f32.s32 %f3, %r76;
tex.2d.v4.u32.f32 {%r77, %r78, %r79, %r80}, [%rd4, {%f1, %f3}];
add.s32 %r81, %r2, -3;
cvt.rn.f32.s32 %f5, %r81;
tex.2d.v4.u32.f32 {%r4, %r82, %r83, %r84}, [%rd4, {%f1, %f5}];
and.b32 %r5, %r77, 65535;
and.b32 %r6, %r72, 65535;
setp.eq.s32 %p5, %r52, 0;
@%p5 bra $L__BB1_4;
// param_11 != 0: spatial-only filter (5077/-981) >> 13, clamp [0, param_12].
add.s32 %r85, %r5, %r6;
mul.lo.s32 %r86, %r85, 5077;
and.b32 %r87, %r3, 65535;
and.b32 %r88, %r4, 65535;
add.s32 %r89, %r88, %r87;
mad.lo.s32 %r90, %r89, -981, %r86;
shr.s32 %r91, %r90, 13;
setp.lt.s32 %p6, %r90, 0;
min.s32 %r92, %r91, %r53;
selp.b32 %r93, 0, %r92, %p6;
st.global.u16 [%rd1], %r93;
bra.uni $L__BB1_11;
$L__BB1_4:
// Temporal path: select adjacent-field textures by (param_9 == param_10),
// sample rows y+-4, y+-2, y from param_1/param_3 and y+-1 from the pair.
setp.eq.s32 %p7, %r50, %r51;
selp.b64 %rd8, %rd4, %rd3, %p7;
selp.b64 %rd9, %rd5, %rd4, %p7;
add.s32 %r94, %r2, 4;
cvt.rn.f32.s32 %f6, %r94;
tex.2d.v4.u32.f32 {%r7, %r8, %r9, %r10}, [%rd3, {%f1, %f6}];
add.s32 %r95, %r2, 2;
cvt.rn.f32.s32 %f7, %r95;
tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f7}];
cvt.rn.f32.s32 %f8, %r2;
tex.2d.v4.u32.f32 {%r96, %r97, %r98, %r99}, [%rd3, {%f1, %f8}];
add.s32 %r100, %r2, -2;
cvt.rn.f32.s32 %f9, %r100;
tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f9}];
add.s32 %r101, %r2, -4;
cvt.rn.f32.s32 %f10, %r101;
tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f10}];
tex.2d.v4.u32.f32 {%r102, %r103, %r104, %r105}, [%rd8, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r106, %r107, %r108, %r109}, [%rd8, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r110, %r111, %r112, %r113}, [%rd9, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r114, %r115, %r116, %r117}, [%rd9, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd5, {%f1, %f7}];
tex.2d.v4.u32.f32 {%r118, %r119, %r120, %r121}, [%rd5, {%f1, %f8}];
tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd5, {%f1, %f9}];
tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd5, {%f1, %f10}];
// r190 = average of param_1/param_3 samples at row y; r37 = |difference|.
and.b32 %r122, %r96, 65535;
and.b32 %r123, %r118, 65535;
add.s32 %r35, %r123, %r122;
shr.u32 %r190, %r35, 1;
sub.s32 %r124, %r122, %r123;
abs.s32 %r37, %r124;
and.b32 %r125, %r106, 65535;
sub.s32 %r126, %r125, %r5;
abs.s32 %r127, %r126;
and.b32 %r128, %r102, 65535;
sub.s32 %r129, %r128, %r6;
abs.s32 %r130, %r129;
add.s32 %r131, %r127, %r130;
shr.u32 %r132, %r131, 1;
and.b32 %r133, %r114, 65535;
sub.s32 %r134, %r133, %r5;
abs.s32 %r135, %r134;
and.b32 %r136, %r110, 65535;
sub.s32 %r137, %r136, %r6;
abs.s32 %r138, %r137;
add.s32 %r139, %r135, %r138;
shr.u32 %r140, %r139, 1;
shr.u32 %r141, %r37, 1;
// r38 = max of the three difference scores; zero -> keep the average.
max.u32 %r142, %r141, %r132;
max.s32 %r38, %r142, %r140;
setp.eq.s32 %p8, %r38, 0;
@%p8 bra $L__BB1_9;
and.b32 %r143, %r15, 65535;
and.b32 %r144, %r27, 65535;
add.s32 %r39, %r144, %r143;
shr.u32 %r145, %r39, 1;
sub.s32 %r146, %r145, %r5;
and.b32 %r147, %r11, 65535;
and.b32 %r148, %r23, 65535;
add.s32 %r40, %r148, %r147;
shr.u32 %r149, %r40, 1;
sub.s32 %r150, %r149, %r6;
min.s32 %r151, %r146, %r150;
sub.s32 %r152, %r190, %r6;
sub.s32 %r153, %r190, %r5;
max.s32 %r154, %r152, %r153;
max.s32 %r155, %r154, %r151;
max.s32 %r156, %r146, %r150;
min.s32 %r157, %r152, %r153;
min.s32 %r158, %r157, %r156;
neg.s32 %r159, %r155;
// r41 = adjusted allowed deviation around the field average.
max.s32 %r160, %r38, %r158;
max.s32 %r41, %r160, %r159;
sub.s32 %r161, %r5, %r6;
abs.s32 %r162, %r161;
// |c[y-1]-c[y+1]| > |field diff| -> high-frequency branch, else spatial.
setp.gt.s32 %p9, %r162, %r37;
add.s32 %r42, %r5, %r6;
and.b32 %r163, %r3, 65535;
and.b32 %r164, %r4, 65535;
add.s32 %r43, %r164, %r163;
@%p9 bra $L__BB1_7;
bra.uni $L__BB1_6;
$L__BB1_7:
// High-frequency filter: 5570/-3801/1016 plus 4309/-213.
// NOTE(review): param_3 is never sampled at row y+4 here and its y-4
// sample (%r31) is added twice (%r173, %r174) -- faithful to the
// generated code; verify against the CUDA source.
mul.lo.s32 %r166, %r35, 5570;
add.s32 %r167, %r39, %r40;
mad.lo.s32 %r168, %r167, -3801, %r166;
and.b32 %r169, %r7, 65535;
and.b32 %r170, %r19, 65535;
add.s32 %r171, %r170, %r169;
and.b32 %r172, %r31, 65535;
add.s32 %r173, %r171, %r172;
add.s32 %r174, %r173, %r172;
mad.lo.s32 %r175, %r174, 1016, %r168;
shr.s32 %r176, %r175, 2;
mul.lo.s32 %r177, %r42, 4309;
mad.lo.s32 %r178, %r43, -213, %r177;
add.s32 %r189, %r178, %r176;
bra.uni $L__BB1_8;
$L__BB1_6:
// Spatial coefficients: 5077*(c[y-1]+c[y+1]) - 981*(c[y-3]+c[y+3]).
mul.lo.s32 %r165, %r42, 5077;
mad.lo.s32 %r189, %r43, -981, %r165;
$L__BB1_8:
// Clamp (>>13) result to [avg - r41, avg + r41], then [0, param_12].
add.s32 %r179, %r41, %r190;
shr.s32 %r180, %r189, 13;
setp.gt.s32 %p10, %r180, %r179;
sub.s32 %r181, %r190, %r41;
max.s32 %r182, %r180, %r181;
selp.b32 %r183, %r179, %r182, %p10;
setp.lt.s32 %p11, %r183, 0;
min.s32 %r184, %r183, %r53;
selp.b32 %r190, 0, %r184, %p11;
$L__BB1_9:
// Store the interpolated 16-bit sample.
st.global.u16 [%rd1], %r190;
$L__BB1_11:
ret;
}
// .globl bwdif_uchar2
//----------------------------------------------------------------------
// bwdif_uchar2 -- two-component 8-bit variant of bwdif_uchar: the same
// filter is applied independently to the .x and .y channels of each
// texture fetch, and both results are stored together with
// st.global.v2.u8 (offset = (y*param_6 + x)*2 bytes).
// Parameter layout matches bwdif_uchar (params 7 and 8 never read).
// Control flow: first component is filtered up to $L__BB2_9, the second
// component from $L__BB2_9 to $L__BB2_14, where both are stored.
//----------------------------------------------------------------------
.visible .entry bwdif_uchar2(
.param .u64 bwdif_uchar2_param_0,
.param .u64 bwdif_uchar2_param_1,
.param .u64 bwdif_uchar2_param_2,
.param .u64 bwdif_uchar2_param_3,
.param .u32 bwdif_uchar2_param_4,
.param .u32 bwdif_uchar2_param_5,
.param .u32 bwdif_uchar2_param_6,
.param .u32 bwdif_uchar2_param_7,
.param .u32 bwdif_uchar2_param_8,
.param .u32 bwdif_uchar2_param_9,
.param .u32 bwdif_uchar2_param_10,
.param .u32 bwdif_uchar2_param_11,
.param .u32 bwdif_uchar2_param_12
)
{
.reg .pred %p<17>;
.reg .b16 %rs<7>;
.reg .f32 %f<12>;
.reg .b32 %r<283>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [bwdif_uchar2_param_0];
ld.param.u64 %rd3, [bwdif_uchar2_param_1];
ld.param.u64 %rd4, [bwdif_uchar2_param_2];
ld.param.u64 %rd5, [bwdif_uchar2_param_3];
ld.param.u32 %r98, [bwdif_uchar2_param_4];
ld.param.u32 %r99, [bwdif_uchar2_param_5];
ld.param.u32 %r93, [bwdif_uchar2_param_6];
ld.param.u32 %r94, [bwdif_uchar2_param_9];
ld.param.u32 %r95, [bwdif_uchar2_param_10];
ld.param.u32 %r96, [bwdif_uchar2_param_11];
ld.param.u32 %r97, [bwdif_uchar2_param_12];
// x/y output coordinates from grid/block indices.
mov.u32 %r100, %ctaid.x;
mov.u32 %r101, %ntid.x;
mov.u32 %r102, %tid.x;
mad.lo.s32 %r1, %r100, %r101, %r102;
mov.u32 %r103, %ntid.y;
mov.u32 %r104, %ctaid.y;
mov.u32 %r105, %tid.y;
mad.lo.s32 %r2, %r104, %r103, %r105;
// Bounds check against param_4 / param_5.
setp.ge.s32 %p1, %r1, %r98;
setp.ge.s32 %p2, %r2, %r99;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB2_16;
cvta.to.global.u64 %rd6, %rd2;
// r109 = y % 2, compared with param_9.
shr.u32 %r106, %r2, 31;
add.s32 %r107, %r2, %r106;
and.b32 %r108, %r107, -2;
sub.s32 %r109, %r2, %r108;
setp.eq.s32 %p4, %r109, %r94;
// rd1 = dst + (y*pitch + x)*2 (two u8 components per pixel).
mad.lo.s32 %r110, %r2, %r93, %r1;
mul.wide.s32 %rd7, %r110, 2;
add.s64 %rd1, %rd6, %rd7;
cvt.rn.f32.s32 %f1, %r1;
@%p4 bra $L__BB2_15;
bra.uni $L__BB2_2;
$L__BB2_15:
// Same-parity row: copy both components of the param_2 sample through.
cvt.rn.f32.s32 %f11, %r2;
tex.2d.v4.u32.f32 {%r275, %r276, %r277, %r278}, [%rd4, {%f1, %f11}];
cvt.u16.u32 %rs5, %r276;
cvt.u16.u32 %rs6, %r275;
st.global.v2.u8 [%rd1], {%rs6, %rs5};
bra.uni $L__BB2_16;
$L__BB2_2:
// Sample rows y+3, y+1, y-1, y-3 from param_2 (.x and .y channels kept).
add.s32 %r111, %r2, 3;
cvt.rn.f32.s32 %f4, %r111;
tex.2d.v4.u32.f32 {%r3, %r4, %r112, %r113}, [%rd4, {%f1, %f4}];
add.s32 %r114, %r2, 1;
cvt.rn.f32.s32 %f2, %r114;
tex.2d.v4.u32.f32 {%r115, %r5, %r116, %r117}, [%rd4, {%f1, %f2}];
add.s32 %r118, %r2, -1;
cvt.rn.f32.s32 %f3, %r118;
tex.2d.v4.u32.f32 {%r119, %r6, %r120, %r121}, [%rd4, {%f1, %f3}];
add.s32 %r122, %r2, -3;
cvt.rn.f32.s32 %f5, %r122;
tex.2d.v4.u32.f32 {%r7, %r8, %r123, %r124}, [%rd4, {%f1, %f5}];
and.b32 %r9, %r119, 255;
and.b32 %r10, %r115, 255;
setp.eq.s32 %p5, %r96, 0;
@%p5 bra $L__BB2_4;
// param_11 != 0: spatial-only filter (5077/-981) applied to each
// component separately, each clamped to [0, param_12].
add.s32 %r125, %r9, %r10;
mul.lo.s32 %r126, %r125, 5077;
and.b32 %r127, %r3, 255;
and.b32 %r128, %r7, 255;
add.s32 %r129, %r128, %r127;
mad.lo.s32 %r130, %r129, -981, %r126;
shr.s32 %r131, %r130, 13;
setp.lt.s32 %p6, %r130, 0;
min.s32 %r132, %r131, %r97;
selp.b32 %r133, 0, %r132, %p6;
and.b32 %r134, %r5, 255;
and.b32 %r135, %r6, 255;
add.s32 %r136, %r135, %r134;
mul.lo.s32 %r137, %r136, 5077;
and.b32 %r138, %r4, 255;
and.b32 %r139, %r8, 255;
add.s32 %r140, %r139, %r138;
mad.lo.s32 %r141, %r140, -981, %r137;
shr.s32 %r142, %r141, 13;
setp.lt.s32 %p7, %r141, 0;
min.s32 %r143, %r142, %r97;
selp.b32 %r144, 0, %r143, %p7;
cvt.u16.u32 %rs1, %r133;
cvt.u16.u32 %rs2, %r144;
st.global.v2.u8 [%rd1], {%rs1, %rs2};
bra.uni $L__BB2_16;
$L__BB2_4:
// Temporal path: select adjacent-field textures by (param_9 == param_10),
// sample rows y+-4, y+-2, y from param_1/param_3 and y+-1 from the pair.
setp.eq.s32 %p8, %r94, %r95;
selp.b64 %rd8, %rd4, %rd3, %p8;
selp.b64 %rd9, %rd5, %rd4, %p8;
add.s32 %r145, %r2, 4;
cvt.rn.f32.s32 %f6, %r145;
tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f6}];
add.s32 %r146, %r2, 2;
cvt.rn.f32.s32 %f7, %r146;
tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f7}];
cvt.rn.f32.s32 %f8, %r2;
tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f8}];
add.s32 %r147, %r2, -2;
cvt.rn.f32.s32 %f9, %r147;
tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd3, {%f1, %f9}];
add.s32 %r148, %r2, -4;
cvt.rn.f32.s32 %f10, %r148;
tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd3, {%f1, %f10}];
tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd8, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r35, %r36, %r37, %r38}, [%rd8, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r39, %r40, %r41, %r42}, [%rd9, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r43, %r44, %r45, %r46}, [%rd9, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r47, %r48, %r49, %r50}, [%rd5, {%f1, %f7}];
tex.2d.v4.u32.f32 {%r51, %r52, %r53, %r54}, [%rd5, {%f1, %f8}];
tex.2d.v4.u32.f32 {%r55, %r56, %r57, %r58}, [%rd5, {%f1, %f9}];
tex.2d.v4.u32.f32 {%r59, %r60, %r61, %r62}, [%rd5, {%f1, %f10}];
// --- First component (.x channel) ---
// r280 = average of param_1/param_3 samples at row y; r65 = |difference|.
and.b32 %r149, %r19, 255;
and.b32 %r150, %r51, 255;
add.s32 %r63, %r150, %r149;
shr.u32 %r280, %r63, 1;
sub.s32 %r151, %r149, %r150;
abs.s32 %r65, %r151;
and.b32 %r152, %r35, 255;
sub.s32 %r153, %r152, %r9;
abs.s32 %r154, %r153;
and.b32 %r155, %r31, 255;
sub.s32 %r156, %r155, %r10;
abs.s32 %r157, %r156;
add.s32 %r158, %r154, %r157;
shr.u32 %r159, %r158, 1;
and.b32 %r160, %r43, 255;
sub.s32 %r161, %r160, %r9;
abs.s32 %r162, %r161;
and.b32 %r163, %r39, 255;
sub.s32 %r164, %r163, %r10;
abs.s32 %r165, %r164;
add.s32 %r166, %r162, %r165;
shr.u32 %r167, %r166, 1;
shr.u32 %r168, %r65, 1;
// r66 = max of the three difference scores; zero -> keep the average.
max.u32 %r169, %r168, %r159;
max.s32 %r66, %r169, %r167;
setp.eq.s32 %p9, %r66, 0;
@%p9 bra $L__BB2_9;
and.b32 %r170, %r23, 255;
and.b32 %r171, %r55, 255;
add.s32 %r67, %r171, %r170;
shr.u32 %r172, %r67, 1;
sub.s32 %r173, %r172, %r9;
and.b32 %r174, %r15, 255;
and.b32 %r175, %r47, 255;
add.s32 %r68, %r175, %r174;
shr.u32 %r176, %r68, 1;
sub.s32 %r177, %r176, %r10;
min.s32 %r178, %r173, %r177;
sub.s32 %r179, %r280, %r10;
sub.s32 %r180, %r280, %r9;
max.s32 %r181, %r179, %r180;
max.s32 %r182, %r181, %r178;
max.s32 %r183, %r173, %r177;
min.s32 %r184, %r179, %r180;
min.s32 %r185, %r184, %r183;
neg.s32 %r186, %r182;
// r69 = adjusted allowed deviation around the field average.
max.s32 %r187, %r66, %r185;
max.s32 %r69, %r187, %r186;
sub.s32 %r188, %r9, %r10;
abs.s32 %r189, %r188;
// |c[y-1]-c[y+1]| > |field diff| -> high-frequency branch, else spatial.
setp.gt.s32 %p10, %r189, %r65;
add.s32 %r70, %r9, %r10;
and.b32 %r190, %r3, 255;
and.b32 %r191, %r7, 255;
add.s32 %r71, %r191, %r190;
@%p10 bra $L__BB2_7;
bra.uni $L__BB2_6;
$L__BB2_7:
// High-frequency filter: 5570/-3801/1016 plus 4309/-213.
// NOTE(review): param_3 is never sampled at row y+4 and its y-4 sample
// (%r59) is added twice below -- faithful to the generated code; verify
// against the CUDA source.
mul.lo.s32 %r193, %r63, 5570;
add.s32 %r194, %r67, %r68;
mad.lo.s32 %r195, %r194, -3801, %r193;
and.b32 %r196, %r11, 255;
and.b32 %r197, %r27, 255;
add.s32 %r198, %r197, %r196;
and.b32 %r199, %r59, 255;
add.s32 %r200, %r198, %r199;
add.s32 %r201, %r200, %r199;
mad.lo.s32 %r202, %r201, 1016, %r195;
shr.s32 %r203, %r202, 2;
mul.lo.s32 %r204, %r70, 4309;
mad.lo.s32 %r205, %r71, -213, %r204;
add.s32 %r279, %r205, %r203;
bra.uni $L__BB2_8;
$L__BB2_6:
// Spatial coefficients: 5077*(c[y-1]+c[y+1]) - 981*(c[y-3]+c[y+3]).
mul.lo.s32 %r192, %r70, 5077;
mad.lo.s32 %r279, %r71, -981, %r192;
$L__BB2_8:
// Clamp (>>13) result to [avg - r69, avg + r69], then [0, param_12].
add.s32 %r206, %r69, %r280;
shr.s32 %r207, %r279, 13;
setp.gt.s32 %p11, %r207, %r206;
sub.s32 %r208, %r280, %r69;
max.s32 %r209, %r207, %r208;
selp.b32 %r210, %r206, %r209, %p11;
setp.lt.s32 %p12, %r210, 0;
min.s32 %r211, %r210, %r97;
selp.b32 %r280, 0, %r211, %p12;
$L__BB2_9:
// --- Second component (.y channel), same filter on the .y fetches ---
and.b32 %r212, %r52, 255;
and.b32 %r213, %r20, 255;
add.s32 %r77, %r212, %r213;
shr.u32 %r282, %r77, 1;
sub.s32 %r214, %r213, %r212;
abs.s32 %r79, %r214;
and.b32 %r215, %r36, 255;
and.b32 %r80, %r6, 255;
sub.s32 %r216, %r215, %r80;
abs.s32 %r217, %r216;
and.b32 %r218, %r32, 255;
and.b32 %r81, %r5, 255;
sub.s32 %r219, %r218, %r81;
abs.s32 %r220, %r219;
add.s32 %r221, %r217, %r220;
shr.u32 %r222, %r221, 1;
and.b32 %r223, %r44, 255;
sub.s32 %r224, %r223, %r80;
abs.s32 %r225, %r224;
and.b32 %r226, %r40, 255;
sub.s32 %r227, %r226, %r81;
abs.s32 %r228, %r227;
add.s32 %r229, %r225, %r228;
shr.u32 %r230, %r229, 1;
shr.u32 %r231, %r79, 1;
// r82 = max of the three difference scores for the second component.
max.u32 %r232, %r231, %r222;
max.s32 %r82, %r232, %r230;
setp.eq.s32 %p13, %r82, 0;
@%p13 bra $L__BB2_14;
and.b32 %r233, %r24, 255;
and.b32 %r234, %r56, 255;
add.s32 %r83, %r234, %r233;
shr.u32 %r235, %r83, 1;
sub.s32 %r236, %r235, %r80;
and.b32 %r237, %r48, 255;
and.b32 %r238, %r16, 255;
add.s32 %r84, %r237, %r238;
shr.u32 %r239, %r84, 1;
sub.s32 %r240, %r239, %r81;
min.s32 %r241, %r236, %r240;
sub.s32 %r242, %r282, %r81;
sub.s32 %r243, %r282, %r80;
max.s32 %r244, %r242, %r243;
max.s32 %r245, %r244, %r241;
max.s32 %r246, %r236, %r240;
min.s32 %r247, %r242, %r243;
min.s32 %r248, %r247, %r246;
neg.s32 %r249, %r245;
// r85 = adjusted allowed deviation for the second component.
max.s32 %r250, %r82, %r248;
max.s32 %r85, %r250, %r249;
sub.s32 %r251, %r80, %r81;
abs.s32 %r252, %r251;
setp.gt.s32 %p14, %r252, %r79;
add.s32 %r86, %r80, %r81;
and.b32 %r253, %r4, 255;
and.b32 %r254, %r8, 255;
add.s32 %r87, %r254, %r253;
@%p14 bra $L__BB2_12;
bra.uni $L__BB2_11;
$L__BB2_12:
// High-frequency filter for the second component (same NOTE as above:
// the y-4 .y sample %r60 is added twice, no y+4 sample of param_3).
mul.lo.s32 %r256, %r77, 5570;
add.s32 %r257, %r83, %r84;
mad.lo.s32 %r258, %r257, -3801, %r256;
and.b32 %r259, %r12, 255;
and.b32 %r260, %r28, 255;
add.s32 %r261, %r260, %r259;
and.b32 %r262, %r60, 255;
add.s32 %r263, %r261, %r262;
add.s32 %r264, %r263, %r262;
mad.lo.s32 %r265, %r264, 1016, %r258;
shr.s32 %r266, %r265, 2;
mul.lo.s32 %r267, %r86, 4309;
mad.lo.s32 %r268, %r87, -213, %r267;
add.s32 %r281, %r268, %r266;
bra.uni $L__BB2_13;
$L__BB2_11:
// Spatial coefficients for the second component.
mul.lo.s32 %r255, %r86, 5077;
mad.lo.s32 %r281, %r87, -981, %r255;
$L__BB2_13:
// Clamp second-component result to [avg - r85, avg + r85], then [0, param_12].
add.s32 %r269, %r85, %r282;
shr.s32 %r270, %r281, 13;
setp.gt.s32 %p15, %r270, %r269;
sub.s32 %r271, %r282, %r85;
max.s32 %r272, %r270, %r271;
selp.b32 %r273, %r269, %r272, %p15;
setp.lt.s32 %p16, %r273, 0;
min.s32 %r274, %r273, %r97;
selp.b32 %r282, 0, %r274, %p16;
$L__BB2_14:
// Store both interpolated components as a v2.u8 pair.
cvt.u16.u32 %rs3, %r282;
cvt.u16.u32 %rs4, %r280;
st.global.v2.u8 [%rd1], {%rs4, %rs3};
$L__BB2_16:
ret;
}
// .globl bwdif_ushort2
//----------------------------------------------------------------------
// bwdif_ushort2 -- two-component 16-bit variant: identical control flow
// to bwdif_uchar2, but samples are masked with 65535 and the pair is
// stored with st.global.v2.u16 (offset = (y*param_6 + x)*4 bytes).
// Parameter layout matches bwdif_uchar (params 7 and 8 never read).
// First component is filtered up to $L__BB3_9, the second component from
// $L__BB3_9 to $L__BB3_14, where both are stored.
//----------------------------------------------------------------------
.visible .entry bwdif_ushort2(
.param .u64 bwdif_ushort2_param_0,
.param .u64 bwdif_ushort2_param_1,
.param .u64 bwdif_ushort2_param_2,
.param .u64 bwdif_ushort2_param_3,
.param .u32 bwdif_ushort2_param_4,
.param .u32 bwdif_ushort2_param_5,
.param .u32 bwdif_ushort2_param_6,
.param .u32 bwdif_ushort2_param_7,
.param .u32 bwdif_ushort2_param_8,
.param .u32 bwdif_ushort2_param_9,
.param .u32 bwdif_ushort2_param_10,
.param .u32 bwdif_ushort2_param_11,
.param .u32 bwdif_ushort2_param_12
)
{
.reg .pred %p<17>;
.reg .b16 %rs<7>;
.reg .f32 %f<12>;
.reg .b32 %r<283>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [bwdif_ushort2_param_0];
ld.param.u64 %rd3, [bwdif_ushort2_param_1];
ld.param.u64 %rd4, [bwdif_ushort2_param_2];
ld.param.u64 %rd5, [bwdif_ushort2_param_3];
ld.param.u32 %r98, [bwdif_ushort2_param_4];
ld.param.u32 %r99, [bwdif_ushort2_param_5];
ld.param.u32 %r93, [bwdif_ushort2_param_6];
ld.param.u32 %r94, [bwdif_ushort2_param_9];
ld.param.u32 %r95, [bwdif_ushort2_param_10];
ld.param.u32 %r96, [bwdif_ushort2_param_11];
ld.param.u32 %r97, [bwdif_ushort2_param_12];
// x/y output coordinates from grid/block indices.
mov.u32 %r100, %ctaid.x;
mov.u32 %r101, %ntid.x;
mov.u32 %r102, %tid.x;
mad.lo.s32 %r1, %r100, %r101, %r102;
mov.u32 %r103, %ntid.y;
mov.u32 %r104, %ctaid.y;
mov.u32 %r105, %tid.y;
mad.lo.s32 %r2, %r104, %r103, %r105;
// Bounds check against param_4 / param_5.
setp.ge.s32 %p1, %r1, %r98;
setp.ge.s32 %p2, %r2, %r99;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB3_16;
cvta.to.global.u64 %rd6, %rd2;
// r109 = y % 2, compared with param_9.
shr.u32 %r106, %r2, 31;
add.s32 %r107, %r2, %r106;
and.b32 %r108, %r107, -2;
sub.s32 %r109, %r2, %r108;
setp.eq.s32 %p4, %r109, %r94;
// rd1 = dst + (y*pitch + x)*4 (two u16 components per pixel).
mad.lo.s32 %r110, %r2, %r93, %r1;
mul.wide.s32 %rd7, %r110, 4;
add.s64 %rd1, %rd6, %rd7;
cvt.rn.f32.s32 %f1, %r1;
@%p4 bra $L__BB3_15;
bra.uni $L__BB3_2;
$L__BB3_15:
// Same-parity row: copy both components of the param_2 sample through.
cvt.rn.f32.s32 %f11, %r2;
tex.2d.v4.u32.f32 {%r275, %r276, %r277, %r278}, [%rd4, {%f1, %f11}];
cvt.u16.u32 %rs5, %r276;
cvt.u16.u32 %rs6, %r275;
st.global.v2.u16 [%rd1], {%rs6, %rs5};
bra.uni $L__BB3_16;
$L__BB3_2:
// Sample rows y+3, y+1, y-1, y-3 from param_2 (.x and .y channels kept).
add.s32 %r111, %r2, 3;
cvt.rn.f32.s32 %f4, %r111;
tex.2d.v4.u32.f32 {%r3, %r4, %r112, %r113}, [%rd4, {%f1, %f4}];
add.s32 %r114, %r2, 1;
cvt.rn.f32.s32 %f2, %r114;
tex.2d.v4.u32.f32 {%r115, %r5, %r116, %r117}, [%rd4, {%f1, %f2}];
add.s32 %r118, %r2, -1;
cvt.rn.f32.s32 %f3, %r118;
tex.2d.v4.u32.f32 {%r119, %r6, %r120, %r121}, [%rd4, {%f1, %f3}];
add.s32 %r122, %r2, -3;
cvt.rn.f32.s32 %f5, %r122;
tex.2d.v4.u32.f32 {%r7, %r8, %r123, %r124}, [%rd4, {%f1, %f5}];
and.b32 %r9, %r119, 65535;
and.b32 %r10, %r115, 65535;
setp.eq.s32 %p5, %r96, 0;
@%p5 bra $L__BB3_4;
// param_11 != 0: spatial-only filter (5077/-981) applied to each
// component separately, each clamped to [0, param_12].
add.s32 %r125, %r9, %r10;
mul.lo.s32 %r126, %r125, 5077;
and.b32 %r127, %r3, 65535;
and.b32 %r128, %r7, 65535;
add.s32 %r129, %r128, %r127;
mad.lo.s32 %r130, %r129, -981, %r126;
shr.s32 %r131, %r130, 13;
setp.lt.s32 %p6, %r130, 0;
min.s32 %r132, %r131, %r97;
selp.b32 %r133, 0, %r132, %p6;
and.b32 %r134, %r5, 65535;
and.b32 %r135, %r6, 65535;
add.s32 %r136, %r135, %r134;
mul.lo.s32 %r137, %r136, 5077;
and.b32 %r138, %r4, 65535;
and.b32 %r139, %r8, 65535;
add.s32 %r140, %r139, %r138;
mad.lo.s32 %r141, %r140, -981, %r137;
shr.s32 %r142, %r141, 13;
setp.lt.s32 %p7, %r141, 0;
min.s32 %r143, %r142, %r97;
selp.b32 %r144, 0, %r143, %p7;
cvt.u16.u32 %rs1, %r133;
cvt.u16.u32 %rs2, %r144;
st.global.v2.u16 [%rd1], {%rs1, %rs2};
bra.uni $L__BB3_16;
$L__BB3_4:
// Temporal path: select adjacent-field textures by (param_9 == param_10),
// sample rows y+-4, y+-2, y from param_1/param_3 and y+-1 from the pair.
setp.eq.s32 %p8, %r94, %r95;
selp.b64 %rd8, %rd4, %rd3, %p8;
selp.b64 %rd9, %rd5, %rd4, %p8;
add.s32 %r145, %r2, 4;
cvt.rn.f32.s32 %f6, %r145;
tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f6}];
add.s32 %r146, %r2, 2;
cvt.rn.f32.s32 %f7, %r146;
tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f7}];
cvt.rn.f32.s32 %f8, %r2;
tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f8}];
add.s32 %r147, %r2, -2;
cvt.rn.f32.s32 %f9, %r147;
tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd3, {%f1, %f9}];
add.s32 %r148, %r2, -4;
cvt.rn.f32.s32 %f10, %r148;
tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd3, {%f1, %f10}];
tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd8, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r35, %r36, %r37, %r38}, [%rd8, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r39, %r40, %r41, %r42}, [%rd9, {%f1, %f2}];
tex.2d.v4.u32.f32 {%r43, %r44, %r45, %r46}, [%rd9, {%f1, %f3}];
tex.2d.v4.u32.f32 {%r47, %r48, %r49, %r50}, [%rd5, {%f1, %f7}];
tex.2d.v4.u32.f32 {%r51, %r52, %r53, %r54}, [%rd5, {%f1, %f8}];
tex.2d.v4.u32.f32 {%r55, %r56, %r57, %r58}, [%rd5, {%f1, %f9}];
tex.2d.v4.u32.f32 {%r59, %r60, %r61, %r62}, [%rd5, {%f1, %f10}];
// --- First component (.x channel) ---
// r280 = average of param_1/param_3 samples at row y; r65 = |difference|.
and.b32 %r149, %r19, 65535;
and.b32 %r150, %r51, 65535;
add.s32 %r63, %r150, %r149;
shr.u32 %r280, %r63, 1;
sub.s32 %r151, %r149, %r150;
abs.s32 %r65, %r151;
and.b32 %r152, %r35, 65535;
sub.s32 %r153, %r152, %r9;
abs.s32 %r154, %r153;
and.b32 %r155, %r31, 65535;
sub.s32 %r156, %r155, %r10;
abs.s32 %r157, %r156;
add.s32 %r158, %r154, %r157;
shr.u32 %r159, %r158, 1;
and.b32 %r160, %r43, 65535;
sub.s32 %r161, %r160, %r9;
abs.s32 %r162, %r161;
and.b32 %r163, %r39, 65535;
sub.s32 %r164, %r163, %r10;
abs.s32 %r165, %r164;
add.s32 %r166, %r162, %r165;
shr.u32 %r167, %r166, 1;
shr.u32 %r168, %r65, 1;
// r66 = max of the three difference scores; zero -> keep the average.
max.u32 %r169, %r168, %r159;
max.s32 %r66, %r169, %r167;
setp.eq.s32 %p9, %r66, 0;
@%p9 bra $L__BB3_9;
and.b32 %r170, %r23, 65535;
and.b32 %r171, %r55, 65535;
add.s32 %r67, %r171, %r170;
shr.u32 %r172, %r67, 1;
sub.s32 %r173, %r172, %r9;
and.b32 %r174, %r15, 65535;
and.b32 %r175, %r47, 65535;
add.s32 %r68, %r175, %r174;
shr.u32 %r176, %r68, 1;
sub.s32 %r177, %r176, %r10;
min.s32 %r178, %r173, %r177;
sub.s32 %r179, %r280, %r10;
sub.s32 %r180, %r280, %r9;
max.s32 %r181, %r179, %r180;
max.s32 %r182, %r181, %r178;
max.s32 %r183, %r173, %r177;
min.s32 %r184, %r179, %r180;
min.s32 %r185, %r184, %r183;
neg.s32 %r186, %r182;
// r69 = adjusted allowed deviation around the field average.
max.s32 %r187, %r66, %r185;
max.s32 %r69, %r187, %r186;
sub.s32 %r188, %r9, %r10;
abs.s32 %r189, %r188;
// |c[y-1]-c[y+1]| > |field diff| -> high-frequency branch, else spatial.
setp.gt.s32 %p10, %r189, %r65;
add.s32 %r70, %r9, %r10;
and.b32 %r190, %r3, 65535;
and.b32 %r191, %r7, 65535;
add.s32 %r71, %r191, %r190;
@%p10 bra $L__BB3_7;
bra.uni $L__BB3_6;
$L__BB3_7:
// High-frequency filter: 5570/-3801/1016 plus 4309/-213.
// NOTE(review): param_3 is never sampled at row y+4 and its y-4 sample
// (%r59) is added twice below -- faithful to the generated code; verify
// against the CUDA source.
mul.lo.s32 %r193, %r63, 5570;
add.s32 %r194, %r67, %r68;
mad.lo.s32 %r195, %r194, -3801, %r193;
and.b32 %r196, %r11, 65535;
and.b32 %r197, %r27, 65535;
add.s32 %r198, %r197, %r196;
and.b32 %r199, %r59, 65535;
add.s32 %r200, %r198, %r199;
add.s32 %r201, %r200, %r199;
mad.lo.s32 %r202, %r201, 1016, %r195;
shr.s32 %r203, %r202, 2;
mul.lo.s32 %r204, %r70, 4309;
mad.lo.s32 %r205, %r71, -213, %r204;
add.s32 %r279, %r205, %r203;
bra.uni $L__BB3_8;
$L__BB3_6:
// Spatial coefficients: 5077*(c[y-1]+c[y+1]) - 981*(c[y-3]+c[y+3]).
mul.lo.s32 %r192, %r70, 5077;
mad.lo.s32 %r279, %r71, -981, %r192;
$L__BB3_8:
// Clamp (>>13) result to [avg - r69, avg + r69], then [0, param_12].
add.s32 %r206, %r69, %r280;
shr.s32 %r207, %r279, 13;
setp.gt.s32 %p11, %r207, %r206;
sub.s32 %r208, %r280, %r69;
max.s32 %r209, %r207, %r208;
selp.b32 %r210, %r206, %r209, %p11;
setp.lt.s32 %p12, %r210, 0;
min.s32 %r211, %r210, %r97;
selp.b32 %r280, 0, %r211, %p12;
$L__BB3_9:
// --- Second component (.y channel), same filter on the .y fetches ---
and.b32 %r212, %r52, 65535;
and.b32 %r213, %r20, 65535;
add.s32 %r77, %r212, %r213;
shr.u32 %r282, %r77, 1;
sub.s32 %r214, %r213, %r212;
abs.s32 %r79, %r214;
and.b32 %r215, %r36, 65535;
and.b32 %r80, %r6, 65535;
sub.s32 %r216, %r215, %r80;
abs.s32 %r217, %r216;
and.b32 %r218, %r32, 65535;
and.b32 %r81, %r5, 65535;
sub.s32 %r219, %r218, %r81;
abs.s32 %r220, %r219;
add.s32 %r221, %r217, %r220;
shr.u32 %r222, %r221, 1;
and.b32 %r223, %r44, 65535;
sub.s32 %r224, %r223, %r80;
abs.s32 %r225, %r224;
and.b32 %r226, %r40, 65535;
sub.s32 %r227, %r226, %r81;
abs.s32 %r228, %r227;
add.s32 %r229, %r225, %r228;
shr.u32 %r230, %r229, 1;
shr.u32 %r231, %r79, 1;
// r82 = max of the three difference scores for the second component.
max.u32 %r232, %r231, %r222;
max.s32 %r82, %r232, %r230;
setp.eq.s32 %p13, %r82, 0;
@%p13 bra $L__BB3_14;
and.b32 %r233, %r24, 65535;
and.b32 %r234, %r56, 65535;
add.s32 %r83, %r234, %r233;
shr.u32 %r235, %r83, 1;
sub.s32 %r236, %r235, %r80;
and.b32 %r237, %r48, 65535;
and.b32 %r238, %r16, 65535;
add.s32 %r84, %r237, %r238;
shr.u32 %r239, %r84, 1;
sub.s32 %r240, %r239, %r81;
min.s32 %r241, %r236, %r240;
sub.s32 %r242, %r282, %r81;
sub.s32 %r243, %r282, %r80;
max.s32 %r244, %r242, %r243;
max.s32 %r245, %r244, %r241;
max.s32 %r246, %r236, %r240;
min.s32 %r247, %r242, %r243;
min.s32 %r248, %r247, %r246;
neg.s32 %r249, %r245;
// r85 = adjusted allowed deviation for the second component.
max.s32 %r250, %r82, %r248;
max.s32 %r85, %r250, %r249;
sub.s32 %r251, %r80, %r81;
abs.s32 %r252, %r251;
setp.gt.s32 %p14, %r252, %r79;
add.s32 %r86, %r80, %r81;
and.b32 %r253, %r4, 65535;
and.b32 %r254, %r8, 65535;
add.s32 %r87, %r254, %r253;
@%p14 bra $L__BB3_12;
bra.uni $L__BB3_11;
$L__BB3_12:
// High-frequency filter for the second component (same NOTE as above:
// the y-4 .y sample %r60 is added twice, no y+4 sample of param_3).
mul.lo.s32 %r256, %r77, 5570;
add.s32 %r257, %r83, %r84;
mad.lo.s32 %r258, %r257, -3801, %r256;
and.b32 %r259, %r12, 65535;
and.b32 %r260, %r28, 65535;
add.s32 %r261, %r260, %r259;
and.b32 %r262, %r60, 65535;
add.s32 %r263, %r261, %r262;
add.s32 %r264, %r263, %r262;
mad.lo.s32 %r265, %r264, 1016, %r258;
shr.s32 %r266, %r265, 2;
mul.lo.s32 %r267, %r86, 4309;
mad.lo.s32 %r268, %r87, -213, %r267;
add.s32 %r281, %r268, %r266;
bra.uni $L__BB3_13;
$L__BB3_11:
// Spatial coefficients for the second component.
mul.lo.s32 %r255, %r86, 5077;
mad.lo.s32 %r281, %r87, -981, %r255;
$L__BB3_13:
// Clamp second-component result to [avg - r85, avg + r85], then [0, param_12].
add.s32 %r269, %r85, %r282;
shr.s32 %r270, %r281, 13;
setp.gt.s32 %p15, %r270, %r269;
sub.s32 %r271, %r282, %r85;
max.s32 %r272, %r270, %r271;
selp.b32 %r273, %r269, %r272, %p15;
setp.lt.s32 %p16, %r273, 0;
min.s32 %r274, %r273, %r97;
selp.b32 %r282, 0, %r274, %p16;
$L__BB3_14:
// Store both interpolated components as a v2.u16 pair.
cvt.u16.u32 %rs3, %r282;
cvt.u16.u32 %rs4, %r280;
st.global.v2.u16 [%rd1], {%rs4, %rs3};
$L__BB3_16:
ret;
}