|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Module header: PTX ISA version, target architecture and pointer width. |
.version 7.8 |
|
.target sm_60 |
|
.address_size 64 |
|
|
|
|
|
// Coefficient tables kept as raw bytes. Read as little-endian 32-bit words |
// they hold the same values that appear as immediates in the kernels of this |
// file; none of the kernels visible here loads from these symbols. |
// coef_lf = {4309, 213} |
.global .align 4 .b8 coef_lf[8] = {213, 16, 0, 0, 213, 0, 0, 0}; |
|
// coef_hf = {5570, 3801, 1016} |
.global .align 4 .b8 coef_hf[12] = {194, 21, 0, 0, 217, 14, 0, 0, 248, 3, 0, 0}; |
|
// coef_sp = {5077, 981} |
.global .align 4 .b8 coef_sp[8] = {213, 19, 0, 0, 213, 3, 0, 0}; |
|
|
|
// bwdif_uchar: each thread produces one 8-bit output sample at (x, y). |
// |
// Parameters, as used by the code below: |
//   param_0     output buffer (generic address, converted to global) |
//   param_1..3  texture handles sampled with tex.2d; presumably the previous, |
//               current and next frame -- confirm against the host code |
//   param_4, 5  exclusive upper bounds for x and y |
//   param_6     output row stride, in samples |
//   param_7, 8  not read by this kernel |
//   param_9     value of y % 2 for rows copied unchanged from param_2 |
//   param_10    compared with param_9 to pick the textures that supply the |
//               y-1 / y+1 temporal neighbours |
//   param_11    non-zero selects the spatial-only path |
//   param_12    upper clamp of the filtered value (lower clamp is 0) |
// |
// The immediates 5077/981, 4309/213 and 5570/3801/1016 equal the little-endian |
// words stored in coef_sp, coef_lf and coef_hf. |
// |
// The 1016-weighted term sums rows y-4 and y+4 of BOTH param_1 and param_3; |
// the param_3 row y+4 sample lives in %r191 (masked into %r195). |
.visible .entry bwdif_uchar( |
    .param .u64 bwdif_uchar_param_0, |
    .param .u64 bwdif_uchar_param_1, |
    .param .u64 bwdif_uchar_param_2, |
    .param .u64 bwdif_uchar_param_3, |
    .param .u32 bwdif_uchar_param_4, |
    .param .u32 bwdif_uchar_param_5, |
    .param .u32 bwdif_uchar_param_6, |
    .param .u32 bwdif_uchar_param_7, |
    .param .u32 bwdif_uchar_param_8, |
    .param .u32 bwdif_uchar_param_9, |
    .param .u32 bwdif_uchar_param_10, |
    .param .u32 bwdif_uchar_param_11, |
    .param .u32 bwdif_uchar_param_12 |
) |
{ |
    .reg .pred %p<12>; |
    .reg .f32 %f<12>; |
    .reg .b32 %r<196>; |
    .reg .b64 %rd<10>; |
|
    ld.param.u64 %rd2, [bwdif_uchar_param_0]; |
    ld.param.u64 %rd3, [bwdif_uchar_param_1]; |
    ld.param.u64 %rd4, [bwdif_uchar_param_2]; |
    ld.param.u64 %rd5, [bwdif_uchar_param_3]; |
    ld.param.u32 %r54, [bwdif_uchar_param_4]; |
    ld.param.u32 %r55, [bwdif_uchar_param_5]; |
    ld.param.u32 %r49, [bwdif_uchar_param_6]; |
    ld.param.u32 %r50, [bwdif_uchar_param_9]; |
    ld.param.u32 %r51, [bwdif_uchar_param_10]; |
    ld.param.u32 %r52, [bwdif_uchar_param_11]; |
    ld.param.u32 %r53, [bwdif_uchar_param_12]; |
|
    // x = ctaid.x * ntid.x + tid.x  (%r1);  y likewise from the .y registers (%r2) |
    mov.u32 %r56, %ctaid.x; |
    mov.u32 %r57, %ntid.x; |
    mov.u32 %r58, %tid.x; |
    mad.lo.s32 %r1, %r56, %r57, %r58; |
    mov.u32 %r59, %ntid.y; |
    mov.u32 %r60, %ctaid.y; |
    mov.u32 %r61, %tid.y; |
    mad.lo.s32 %r2, %r60, %r59, %r61; |
|
    // Threads with x >= param_4 or y >= param_5 exit without writing. |
    setp.ge.s32 %p1, %r1, %r54; |
    setp.ge.s32 %p2, %r2, %r55; |
    or.pred %p3, %p1, %p2; |
    @%p3 bra $L__BB0_11; |
|
    cvta.to.global.u64 %rd6, %rd2; |
    // %r65 = y % 2 (signed remainder), compared with param_9 |
    shr.u32 %r62, %r2, 31; |
    add.s32 %r63, %r2, %r62; |
    and.b32 %r64, %r63, -2; |
    sub.s32 %r65, %r2, %r64; |
    setp.eq.s32 %p4, %r65, %r50; |
    cvt.rn.f32.s32 %f1, %r1; |
    // %rd1 = output base + (y * param_6 + x) bytes |
    mad.lo.s32 %r66, %r2, %r49, %r1; |
    cvt.s64.s32 %rd7, %r66; |
    add.s64 %rd1, %rd6, %rd7; |
    @%p4 bra $L__BB0_10; |
    bra.uni $L__BB0_2; |
|
// y % 2 == param_9: store the param_2 sample at (x, y) unchanged. |
$L__BB0_10: |
    cvt.rn.f32.s32 %f11, %r2; |
    tex.2d.v4.u32.f32 {%r185, %r186, %r187, %r188}, [%rd4, {%f1, %f11}]; |
    st.global.u8 [%rd1], %r185; |
    bra.uni $L__BB0_11; |
|
// Row has to be synthesised: fetch param_2 rows y+3, y+1, y-1, y-3. |
$L__BB0_2: |
    add.s32 %r67, %r2, 3; |
    cvt.rn.f32.s32 %f4, %r67; |
    tex.2d.v4.u32.f32 {%r3, %r68, %r69, %r70}, [%rd4, {%f1, %f4}]; |
    add.s32 %r71, %r2, 1; |
    cvt.rn.f32.s32 %f2, %r71; |
    tex.2d.v4.u32.f32 {%r72, %r73, %r74, %r75}, [%rd4, {%f1, %f2}]; |
    add.s32 %r76, %r2, -1; |
    cvt.rn.f32.s32 %f3, %r76; |
    tex.2d.v4.u32.f32 {%r77, %r78, %r79, %r80}, [%rd4, {%f1, %f3}]; |
    add.s32 %r81, %r2, -3; |
    cvt.rn.f32.s32 %f5, %r81; |
    tex.2d.v4.u32.f32 {%r4, %r82, %r83, %r84}, [%rd4, {%f1, %f5}]; |
    // c = param_2 row y-1 (%r5), e = param_2 row y+1 (%r6) |
    and.b32 %r5, %r77, 255; |
    and.b32 %r6, %r72, 255; |
    setp.eq.s32 %p5, %r52, 0; |
    @%p5 bra $L__BB0_4; |
|
    // param_11 != 0: spatial-only result |
    //   clamp((5077 * (c + e) - 981 * (row y-3 + row y+3)) >> 13, 0, param_12) |
    add.s32 %r85, %r5, %r6; |
    mul.lo.s32 %r86, %r85, 5077; |
    and.b32 %r87, %r3, 255; |
    and.b32 %r88, %r4, 255; |
    add.s32 %r89, %r88, %r87; |
    mad.lo.s32 %r90, %r89, -981, %r86; |
    shr.s32 %r91, %r90, 13; |
    setp.lt.s32 %p6, %r90, 0; |
    min.s32 %r92, %r91, %r53; |
    selp.b32 %r93, 0, %r92, %p6; |
    st.global.u8 [%rd1], %r93; |
    bra.uni $L__BB0_11; |
|
// param_11 == 0: temporal path. |
$L__BB0_4: |
    // Textures for the y-1 / y+1 temporal neighbours: |
    //   param_9 == param_10 -> %rd8 = param_2, %rd9 = param_3 |
    //   otherwise           -> %rd8 = param_1, %rd9 = param_2 |
    setp.eq.s32 %p7, %r50, %r51; |
    selp.b64 %rd8, %rd4, %rd3, %p7; |
    selp.b64 %rd9, %rd5, %rd4, %p7; |
    // param_1 rows y+4, y+2, y, y-2, y-4 |
    add.s32 %r94, %r2, 4; |
    cvt.rn.f32.s32 %f6, %r94; |
    tex.2d.v4.u32.f32 {%r7, %r8, %r9, %r10}, [%rd3, {%f1, %f6}]; |
    add.s32 %r95, %r2, 2; |
    cvt.rn.f32.s32 %f7, %r95; |
    tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f7}]; |
    cvt.rn.f32.s32 %f8, %r2; |
    tex.2d.v4.u32.f32 {%r96, %r97, %r98, %r99}, [%rd3, {%f1, %f8}]; |
    add.s32 %r100, %r2, -2; |
    cvt.rn.f32.s32 %f9, %r100; |
    tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f9}]; |
    add.s32 %r101, %r2, -4; |
    cvt.rn.f32.s32 %f10, %r101; |
    tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f10}]; |
    // %rd8 rows y+1, y-1, then %rd9 rows y+1, y-1 |
    tex.2d.v4.u32.f32 {%r102, %r103, %r104, %r105}, [%rd8, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r106, %r107, %r108, %r109}, [%rd8, {%f1, %f3}]; |
    tex.2d.v4.u32.f32 {%r110, %r111, %r112, %r113}, [%rd9, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r114, %r115, %r116, %r117}, [%rd9, {%f1, %f3}]; |
    // param_3 rows y+4, y+2, y, y-2, y-4 |
    tex.2d.v4.u32.f32 {%r191, %r192, %r193, %r194}, [%rd5, {%f1, %f6}]; |
    tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd5, {%f1, %f7}]; |
    tex.2d.v4.u32.f32 {%r118, %r119, %r120, %r121}, [%rd5, {%f1, %f8}]; |
    tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd5, {%f1, %f9}]; |
    tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd5, {%f1, %f10}]; |
|
    // d = (param_1 row y + param_3 row y) >> 1 (%r190); %r37 = |their difference| |
    and.b32 %r122, %r96, 255; |
    and.b32 %r123, %r118, 255; |
    add.s32 %r35, %r123, %r122; |
    shr.u32 %r190, %r35, 1; |
    sub.s32 %r124, %r122, %r123; |
    abs.s32 %r37, %r124; |
    // %r132 = (|%rd8 row y-1 - c| + |%rd8 row y+1 - e|) >> 1 |
    and.b32 %r125, %r106, 255; |
    sub.s32 %r126, %r125, %r5; |
    abs.s32 %r127, %r126; |
    and.b32 %r128, %r102, 255; |
    sub.s32 %r129, %r128, %r6; |
    abs.s32 %r130, %r129; |
    add.s32 %r131, %r127, %r130; |
    shr.u32 %r132, %r131, 1; |
    // %r140 = the same measure taken against %rd9 |
    and.b32 %r133, %r114, 255; |
    sub.s32 %r134, %r133, %r5; |
    abs.s32 %r135, %r134; |
    and.b32 %r136, %r110, 255; |
    sub.s32 %r137, %r136, %r6; |
    abs.s32 %r138, %r137; |
    add.s32 %r139, %r135, %r138; |
    shr.u32 %r140, %r139, 1; |
    // %r38 = max(%r37 >> 1, %r132, %r140); when it is zero d is stored as is |
    shr.u32 %r141, %r37, 1; |
    max.u32 %r142, %r141, %r132; |
    max.s32 %r38, %r142, %r140; |
    setp.eq.s32 %p8, %r38, 0; |
    @%p8 bra $L__BB0_9; |
|
    // b = ((param_1 + param_3 row y-2) >> 1) - c   (%r146) |
    // f = ((param_1 + param_3 row y+2) >> 1) - e   (%r150) |
    and.b32 %r143, %r15, 255; |
    and.b32 %r144, %r27, 255; |
    add.s32 %r39, %r144, %r143; |
    shr.u32 %r145, %r39, 1; |
    sub.s32 %r146, %r145, %r5; |
    and.b32 %r147, %r11, 255; |
    and.b32 %r148, %r23, 255; |
    add.s32 %r40, %r148, %r147; |
    shr.u32 %r149, %r40, 1; |
    sub.s32 %r150, %r149, %r6; |
    // %r41 = max(%r38, min(d-e, d-c, max(b, f)), -max(d-e, d-c, min(b, f))) |
    min.s32 %r151, %r146, %r150; |
    sub.s32 %r152, %r190, %r6; |
    sub.s32 %r153, %r190, %r5; |
    max.s32 %r154, %r152, %r153; |
    max.s32 %r155, %r154, %r151; |
    max.s32 %r156, %r146, %r150; |
    min.s32 %r157, %r152, %r153; |
    min.s32 %r158, %r157, %r156; |
    neg.s32 %r159, %r155; |
    max.s32 %r160, %r38, %r158; |
    max.s32 %r41, %r160, %r159; |
    // |c - e| > %r37 selects $L__BB0_7, otherwise $L__BB0_6. |
    // %r42 = c + e, %r43 = param_2 row y-3 + row y+3 (shared by both branches). |
    sub.s32 %r161, %r5, %r6; |
    abs.s32 %r162, %r161; |
    setp.gt.s32 %p9, %r162, %r37; |
    add.s32 %r42, %r5, %r6; |
    and.b32 %r163, %r3, 255; |
    and.b32 %r164, %r4, 255; |
    add.s32 %r43, %r164, %r163; |
    @%p9 bra $L__BB0_7; |
    bra.uni $L__BB0_6; |
|
// %r189 = ((5570 * S0 - 3801 * S2 + 1016 * S4) >> 2) + 4309 * (c + e) - 213 * %r43 |
//   S0 = param_1 + param_3 at row y |
//   S2 = param_1 + param_3 at rows y-2 and y+2 |
//   S4 = param_1 + param_3 at rows y-4 and y+4 |
$L__BB0_7: |
    mul.lo.s32 %r166, %r35, 5570; |
    add.s32 %r167, %r39, %r40; |
    mad.lo.s32 %r168, %r167, -3801, %r166; |
    and.b32 %r169, %r7, 255; |
    and.b32 %r170, %r19, 255; |
    add.s32 %r171, %r170, %r169; |
    and.b32 %r172, %r31, 255; |
    add.s32 %r173, %r171, %r172; |
    // param_3 row y+4 completes S4 |
    and.b32 %r195, %r191, 255; |
    add.s32 %r174, %r173, %r195; |
    mad.lo.s32 %r175, %r174, 1016, %r168; |
    shr.s32 %r176, %r175, 2; |
    mul.lo.s32 %r177, %r42, 4309; |
    mad.lo.s32 %r178, %r43, -213, %r177; |
    add.s32 %r189, %r178, %r176; |
    bra.uni $L__BB0_8; |
|
// %r189 = 5077 * (c + e) - 981 * %r43 |
$L__BB0_6: |
    mul.lo.s32 %r165, %r42, 5077; |
    mad.lo.s32 %r189, %r43, -981, %r165; |
|
// Limit (%r189 >> 13) to [d - %r41, d + %r41], then clamp to [0, param_12]. |
$L__BB0_8: |
    add.s32 %r179, %r41, %r190; |
    shr.s32 %r180, %r189, 13; |
    setp.gt.s32 %p10, %r180, %r179; |
    sub.s32 %r181, %r190, %r41; |
    max.s32 %r182, %r180, %r181; |
    selp.b32 %r183, %r179, %r182, %p10; |
    setp.lt.s32 %p11, %r183, 0; |
    min.s32 %r184, %r183, %r53; |
    selp.b32 %r190, 0, %r184, %p11; |
|
$L__BB0_9: |
    st.global.u8 [%rd1], %r190; |
|
$L__BB0_11: |
    ret; |
|
} |
|
|
|
// bwdif_ushort: each thread produces one 16-bit output sample at (x, y). |
// |
// Parameters, as used by the code below: |
//   param_0     output buffer (generic address, converted to global) |
//   param_1..3  texture handles sampled with tex.2d; presumably the previous, |
//               current and next frame -- confirm against the host code |
//   param_4, 5  exclusive upper bounds for x and y |
//   param_6     output row stride, in 16-bit samples |
//   param_7, 8  not read by this kernel |
//   param_9     value of y % 2 for rows copied unchanged from param_2 |
//   param_10    compared with param_9 to pick the textures that supply the |
//               y-1 / y+1 temporal neighbours |
//   param_11    non-zero selects the spatial-only path |
//   param_12    upper clamp of the filtered value (lower clamp is 0) |
// |
// The immediates 5077/981, 4309/213 and 5570/3801/1016 equal the little-endian |
// words stored in coef_sp, coef_lf and coef_hf. |
// |
// The 1016-weighted term sums rows y-4 and y+4 of BOTH param_1 and param_3; |
// the param_3 row y+4 sample lives in %r191 (masked into %r195). |
.visible .entry bwdif_ushort( |
    .param .u64 bwdif_ushort_param_0, |
    .param .u64 bwdif_ushort_param_1, |
    .param .u64 bwdif_ushort_param_2, |
    .param .u64 bwdif_ushort_param_3, |
    .param .u32 bwdif_ushort_param_4, |
    .param .u32 bwdif_ushort_param_5, |
    .param .u32 bwdif_ushort_param_6, |
    .param .u32 bwdif_ushort_param_7, |
    .param .u32 bwdif_ushort_param_8, |
    .param .u32 bwdif_ushort_param_9, |
    .param .u32 bwdif_ushort_param_10, |
    .param .u32 bwdif_ushort_param_11, |
    .param .u32 bwdif_ushort_param_12 |
) |
{ |
    .reg .pred %p<12>; |
    .reg .f32 %f<12>; |
    .reg .b32 %r<196>; |
    .reg .b64 %rd<10>; |
|
    ld.param.u64 %rd2, [bwdif_ushort_param_0]; |
    ld.param.u64 %rd3, [bwdif_ushort_param_1]; |
    ld.param.u64 %rd4, [bwdif_ushort_param_2]; |
    ld.param.u64 %rd5, [bwdif_ushort_param_3]; |
    ld.param.u32 %r54, [bwdif_ushort_param_4]; |
    ld.param.u32 %r55, [bwdif_ushort_param_5]; |
    ld.param.u32 %r49, [bwdif_ushort_param_6]; |
    ld.param.u32 %r50, [bwdif_ushort_param_9]; |
    ld.param.u32 %r51, [bwdif_ushort_param_10]; |
    ld.param.u32 %r52, [bwdif_ushort_param_11]; |
    ld.param.u32 %r53, [bwdif_ushort_param_12]; |
|
    // x = ctaid.x * ntid.x + tid.x  (%r1);  y likewise from the .y registers (%r2) |
    mov.u32 %r56, %ctaid.x; |
    mov.u32 %r57, %ntid.x; |
    mov.u32 %r58, %tid.x; |
    mad.lo.s32 %r1, %r56, %r57, %r58; |
    mov.u32 %r59, %ntid.y; |
    mov.u32 %r60, %ctaid.y; |
    mov.u32 %r61, %tid.y; |
    mad.lo.s32 %r2, %r60, %r59, %r61; |
|
    // Threads with x >= param_4 or y >= param_5 exit without writing. |
    setp.ge.s32 %p1, %r1, %r54; |
    setp.ge.s32 %p2, %r2, %r55; |
    or.pred %p3, %p1, %p2; |
    @%p3 bra $L__BB1_11; |
|
    cvta.to.global.u64 %rd6, %rd2; |
    // %r65 = y % 2 (signed remainder), compared with param_9 |
    shr.u32 %r62, %r2, 31; |
    add.s32 %r63, %r2, %r62; |
    and.b32 %r64, %r63, -2; |
    sub.s32 %r65, %r2, %r64; |
    setp.eq.s32 %p4, %r65, %r50; |
    cvt.rn.f32.s32 %f1, %r1; |
    // %rd1 = output base + (y * param_6 + x) * 2 bytes |
    mad.lo.s32 %r66, %r2, %r49, %r1; |
    mul.wide.s32 %rd7, %r66, 2; |
    add.s64 %rd1, %rd6, %rd7; |
    @%p4 bra $L__BB1_10; |
    bra.uni $L__BB1_2; |
|
// y % 2 == param_9: store the param_2 sample at (x, y) unchanged. |
$L__BB1_10: |
    cvt.rn.f32.s32 %f11, %r2; |
    tex.2d.v4.u32.f32 {%r185, %r186, %r187, %r188}, [%rd4, {%f1, %f11}]; |
    st.global.u16 [%rd1], %r185; |
    bra.uni $L__BB1_11; |
|
// Row has to be synthesised: fetch param_2 rows y+3, y+1, y-1, y-3. |
$L__BB1_2: |
    add.s32 %r67, %r2, 3; |
    cvt.rn.f32.s32 %f4, %r67; |
    tex.2d.v4.u32.f32 {%r3, %r68, %r69, %r70}, [%rd4, {%f1, %f4}]; |
    add.s32 %r71, %r2, 1; |
    cvt.rn.f32.s32 %f2, %r71; |
    tex.2d.v4.u32.f32 {%r72, %r73, %r74, %r75}, [%rd4, {%f1, %f2}]; |
    add.s32 %r76, %r2, -1; |
    cvt.rn.f32.s32 %f3, %r76; |
    tex.2d.v4.u32.f32 {%r77, %r78, %r79, %r80}, [%rd4, {%f1, %f3}]; |
    add.s32 %r81, %r2, -3; |
    cvt.rn.f32.s32 %f5, %r81; |
    tex.2d.v4.u32.f32 {%r4, %r82, %r83, %r84}, [%rd4, {%f1, %f5}]; |
    // c = param_2 row y-1 (%r5), e = param_2 row y+1 (%r6) |
    and.b32 %r5, %r77, 65535; |
    and.b32 %r6, %r72, 65535; |
    setp.eq.s32 %p5, %r52, 0; |
    @%p5 bra $L__BB1_4; |
|
    // param_11 != 0: spatial-only result |
    //   clamp((5077 * (c + e) - 981 * (row y-3 + row y+3)) >> 13, 0, param_12) |
    add.s32 %r85, %r5, %r6; |
    mul.lo.s32 %r86, %r85, 5077; |
    and.b32 %r87, %r3, 65535; |
    and.b32 %r88, %r4, 65535; |
    add.s32 %r89, %r88, %r87; |
    mad.lo.s32 %r90, %r89, -981, %r86; |
    shr.s32 %r91, %r90, 13; |
    setp.lt.s32 %p6, %r90, 0; |
    min.s32 %r92, %r91, %r53; |
    selp.b32 %r93, 0, %r92, %p6; |
    st.global.u16 [%rd1], %r93; |
    bra.uni $L__BB1_11; |
|
// param_11 == 0: temporal path. |
$L__BB1_4: |
    // Textures for the y-1 / y+1 temporal neighbours: |
    //   param_9 == param_10 -> %rd8 = param_2, %rd9 = param_3 |
    //   otherwise           -> %rd8 = param_1, %rd9 = param_2 |
    setp.eq.s32 %p7, %r50, %r51; |
    selp.b64 %rd8, %rd4, %rd3, %p7; |
    selp.b64 %rd9, %rd5, %rd4, %p7; |
    // param_1 rows y+4, y+2, y, y-2, y-4 |
    add.s32 %r94, %r2, 4; |
    cvt.rn.f32.s32 %f6, %r94; |
    tex.2d.v4.u32.f32 {%r7, %r8, %r9, %r10}, [%rd3, {%f1, %f6}]; |
    add.s32 %r95, %r2, 2; |
    cvt.rn.f32.s32 %f7, %r95; |
    tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f7}]; |
    cvt.rn.f32.s32 %f8, %r2; |
    tex.2d.v4.u32.f32 {%r96, %r97, %r98, %r99}, [%rd3, {%f1, %f8}]; |
    add.s32 %r100, %r2, -2; |
    cvt.rn.f32.s32 %f9, %r100; |
    tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f9}]; |
    add.s32 %r101, %r2, -4; |
    cvt.rn.f32.s32 %f10, %r101; |
    tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f10}]; |
    // %rd8 rows y+1, y-1, then %rd9 rows y+1, y-1 |
    tex.2d.v4.u32.f32 {%r102, %r103, %r104, %r105}, [%rd8, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r106, %r107, %r108, %r109}, [%rd8, {%f1, %f3}]; |
    tex.2d.v4.u32.f32 {%r110, %r111, %r112, %r113}, [%rd9, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r114, %r115, %r116, %r117}, [%rd9, {%f1, %f3}]; |
    // param_3 rows y+4, y+2, y, y-2, y-4 |
    tex.2d.v4.u32.f32 {%r191, %r192, %r193, %r194}, [%rd5, {%f1, %f6}]; |
    tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd5, {%f1, %f7}]; |
    tex.2d.v4.u32.f32 {%r118, %r119, %r120, %r121}, [%rd5, {%f1, %f8}]; |
    tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd5, {%f1, %f9}]; |
    tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd5, {%f1, %f10}]; |
|
    // d = (param_1 row y + param_3 row y) >> 1 (%r190); %r37 = |their difference| |
    and.b32 %r122, %r96, 65535; |
    and.b32 %r123, %r118, 65535; |
    add.s32 %r35, %r123, %r122; |
    shr.u32 %r190, %r35, 1; |
    sub.s32 %r124, %r122, %r123; |
    abs.s32 %r37, %r124; |
    // %r132 = (|%rd8 row y-1 - c| + |%rd8 row y+1 - e|) >> 1 |
    and.b32 %r125, %r106, 65535; |
    sub.s32 %r126, %r125, %r5; |
    abs.s32 %r127, %r126; |
    and.b32 %r128, %r102, 65535; |
    sub.s32 %r129, %r128, %r6; |
    abs.s32 %r130, %r129; |
    add.s32 %r131, %r127, %r130; |
    shr.u32 %r132, %r131, 1; |
    // %r140 = the same measure taken against %rd9 |
    and.b32 %r133, %r114, 65535; |
    sub.s32 %r134, %r133, %r5; |
    abs.s32 %r135, %r134; |
    and.b32 %r136, %r110, 65535; |
    sub.s32 %r137, %r136, %r6; |
    abs.s32 %r138, %r137; |
    add.s32 %r139, %r135, %r138; |
    shr.u32 %r140, %r139, 1; |
    // %r38 = max(%r37 >> 1, %r132, %r140); when it is zero d is stored as is |
    shr.u32 %r141, %r37, 1; |
    max.u32 %r142, %r141, %r132; |
    max.s32 %r38, %r142, %r140; |
    setp.eq.s32 %p8, %r38, 0; |
    @%p8 bra $L__BB1_9; |
|
    // b = ((param_1 + param_3 row y-2) >> 1) - c   (%r146) |
    // f = ((param_1 + param_3 row y+2) >> 1) - e   (%r150) |
    and.b32 %r143, %r15, 65535; |
    and.b32 %r144, %r27, 65535; |
    add.s32 %r39, %r144, %r143; |
    shr.u32 %r145, %r39, 1; |
    sub.s32 %r146, %r145, %r5; |
    and.b32 %r147, %r11, 65535; |
    and.b32 %r148, %r23, 65535; |
    add.s32 %r40, %r148, %r147; |
    shr.u32 %r149, %r40, 1; |
    sub.s32 %r150, %r149, %r6; |
    // %r41 = max(%r38, min(d-e, d-c, max(b, f)), -max(d-e, d-c, min(b, f))) |
    min.s32 %r151, %r146, %r150; |
    sub.s32 %r152, %r190, %r6; |
    sub.s32 %r153, %r190, %r5; |
    max.s32 %r154, %r152, %r153; |
    max.s32 %r155, %r154, %r151; |
    max.s32 %r156, %r146, %r150; |
    min.s32 %r157, %r152, %r153; |
    min.s32 %r158, %r157, %r156; |
    neg.s32 %r159, %r155; |
    max.s32 %r160, %r38, %r158; |
    max.s32 %r41, %r160, %r159; |
    // |c - e| > %r37 selects $L__BB1_7, otherwise $L__BB1_6. |
    // %r42 = c + e, %r43 = param_2 row y-3 + row y+3 (shared by both branches). |
    sub.s32 %r161, %r5, %r6; |
    abs.s32 %r162, %r161; |
    setp.gt.s32 %p9, %r162, %r37; |
    add.s32 %r42, %r5, %r6; |
    and.b32 %r163, %r3, 65535; |
    and.b32 %r164, %r4, 65535; |
    add.s32 %r43, %r164, %r163; |
    @%p9 bra $L__BB1_7; |
    bra.uni $L__BB1_6; |
|
// %r189 = ((5570 * S0 - 3801 * S2 + 1016 * S4) >> 2) + 4309 * (c + e) - 213 * %r43 |
//   S0 = param_1 + param_3 at row y |
//   S2 = param_1 + param_3 at rows y-2 and y+2 |
//   S4 = param_1 + param_3 at rows y-4 and y+4 |
$L__BB1_7: |
    mul.lo.s32 %r166, %r35, 5570; |
    add.s32 %r167, %r39, %r40; |
    mad.lo.s32 %r168, %r167, -3801, %r166; |
    and.b32 %r169, %r7, 65535; |
    and.b32 %r170, %r19, 65535; |
    add.s32 %r171, %r170, %r169; |
    and.b32 %r172, %r31, 65535; |
    add.s32 %r173, %r171, %r172; |
    // param_3 row y+4 completes S4 |
    and.b32 %r195, %r191, 65535; |
    add.s32 %r174, %r173, %r195; |
    mad.lo.s32 %r175, %r174, 1016, %r168; |
    shr.s32 %r176, %r175, 2; |
    mul.lo.s32 %r177, %r42, 4309; |
    mad.lo.s32 %r178, %r43, -213, %r177; |
    add.s32 %r189, %r178, %r176; |
    bra.uni $L__BB1_8; |
|
// %r189 = 5077 * (c + e) - 981 * %r43 |
$L__BB1_6: |
    mul.lo.s32 %r165, %r42, 5077; |
    mad.lo.s32 %r189, %r43, -981, %r165; |
|
// Limit (%r189 >> 13) to [d - %r41, d + %r41], then clamp to [0, param_12]. |
$L__BB1_8: |
    add.s32 %r179, %r41, %r190; |
    shr.s32 %r180, %r189, 13; |
    setp.gt.s32 %p10, %r180, %r179; |
    sub.s32 %r181, %r190, %r41; |
    max.s32 %r182, %r180, %r181; |
    selp.b32 %r183, %r179, %r182, %p10; |
    setp.lt.s32 %p11, %r183, 0; |
    min.s32 %r184, %r183, %r53; |
    selp.b32 %r190, 0, %r184, %p11; |
|
$L__BB1_9: |
    st.global.u16 [%rd1], %r190; |
|
$L__BB1_11: |
    ret; |
|
} |
|
|
|
// bwdif_uchar2: each thread produces one two-component 8-bit output element at |
// (x, y). Component 0 is taken from the first tex result register, component 1 |
// from the second; both are filtered with the same arithmetic. |
// |
// Parameters, as used by the code below: |
//   param_0     output buffer (generic address, converted to global) |
//   param_1..3  texture handles sampled with tex.2d; presumably the previous, |
//               current and next frame -- confirm against the host code |
//   param_4, 5  exclusive upper bounds for x and y |
//   param_6     output row stride, in two-byte elements |
//   param_7, 8  not read by this kernel |
//   param_9     value of y % 2 for rows copied unchanged from param_2 |
//   param_10    compared with param_9 to pick the textures that supply the |
//               y-1 / y+1 temporal neighbours |
//   param_11    non-zero selects the spatial-only path |
//   param_12    upper clamp of each filtered value (lower clamp is 0) |
// |
// The immediates 5077/981, 4309/213 and 5570/3801/1016 equal the little-endian |
// words stored in coef_sp, coef_lf and coef_hf. |
// |
// The 1016-weighted term sums rows y-4 and y+4 of BOTH param_1 and param_3; |
// the param_3 row y+4 texel lives in %r283 / %r284 (masked into %r287 / %r288). |
.visible .entry bwdif_uchar2( |
    .param .u64 bwdif_uchar2_param_0, |
    .param .u64 bwdif_uchar2_param_1, |
    .param .u64 bwdif_uchar2_param_2, |
    .param .u64 bwdif_uchar2_param_3, |
    .param .u32 bwdif_uchar2_param_4, |
    .param .u32 bwdif_uchar2_param_5, |
    .param .u32 bwdif_uchar2_param_6, |
    .param .u32 bwdif_uchar2_param_7, |
    .param .u32 bwdif_uchar2_param_8, |
    .param .u32 bwdif_uchar2_param_9, |
    .param .u32 bwdif_uchar2_param_10, |
    .param .u32 bwdif_uchar2_param_11, |
    .param .u32 bwdif_uchar2_param_12 |
) |
{ |
    .reg .pred %p<17>; |
    .reg .b16 %rs<7>; |
    .reg .f32 %f<12>; |
    .reg .b32 %r<289>; |
    .reg .b64 %rd<10>; |
|
    ld.param.u64 %rd2, [bwdif_uchar2_param_0]; |
    ld.param.u64 %rd3, [bwdif_uchar2_param_1]; |
    ld.param.u64 %rd4, [bwdif_uchar2_param_2]; |
    ld.param.u64 %rd5, [bwdif_uchar2_param_3]; |
    ld.param.u32 %r98, [bwdif_uchar2_param_4]; |
    ld.param.u32 %r99, [bwdif_uchar2_param_5]; |
    ld.param.u32 %r93, [bwdif_uchar2_param_6]; |
    ld.param.u32 %r94, [bwdif_uchar2_param_9]; |
    ld.param.u32 %r95, [bwdif_uchar2_param_10]; |
    ld.param.u32 %r96, [bwdif_uchar2_param_11]; |
    ld.param.u32 %r97, [bwdif_uchar2_param_12]; |
|
    // x = ctaid.x * ntid.x + tid.x  (%r1);  y likewise from the .y registers (%r2) |
    mov.u32 %r100, %ctaid.x; |
    mov.u32 %r101, %ntid.x; |
    mov.u32 %r102, %tid.x; |
    mad.lo.s32 %r1, %r100, %r101, %r102; |
    mov.u32 %r103, %ntid.y; |
    mov.u32 %r104, %ctaid.y; |
    mov.u32 %r105, %tid.y; |
    mad.lo.s32 %r2, %r104, %r103, %r105; |
|
    // Threads with x >= param_4 or y >= param_5 exit without writing. |
    setp.ge.s32 %p1, %r1, %r98; |
    setp.ge.s32 %p2, %r2, %r99; |
    or.pred %p3, %p1, %p2; |
    @%p3 bra $L__BB2_16; |
|
    cvta.to.global.u64 %rd6, %rd2; |
    // %r109 = y % 2 (signed remainder), compared with param_9 |
    shr.u32 %r106, %r2, 31; |
    add.s32 %r107, %r2, %r106; |
    and.b32 %r108, %r107, -2; |
    sub.s32 %r109, %r2, %r108; |
    setp.eq.s32 %p4, %r109, %r94; |
    // %rd1 = output base + (y * param_6 + x) * 2 bytes |
    mad.lo.s32 %r110, %r2, %r93, %r1; |
    mul.wide.s32 %rd7, %r110, 2; |
    add.s64 %rd1, %rd6, %rd7; |
    cvt.rn.f32.s32 %f1, %r1; |
    @%p4 bra $L__BB2_15; |
    bra.uni $L__BB2_2; |
|
// y % 2 == param_9: store both components of the param_2 texel unchanged. |
$L__BB2_15: |
    cvt.rn.f32.s32 %f11, %r2; |
    tex.2d.v4.u32.f32 {%r275, %r276, %r277, %r278}, [%rd4, {%f1, %f11}]; |
    cvt.u16.u32 %rs5, %r276; |
    cvt.u16.u32 %rs6, %r275; |
    st.global.v2.u8 [%rd1], {%rs6, %rs5}; |
    bra.uni $L__BB2_16; |
|
// Row has to be synthesised: fetch param_2 rows y+3, y+1, y-1, y-3. |
$L__BB2_2: |
    add.s32 %r111, %r2, 3; |
    cvt.rn.f32.s32 %f4, %r111; |
    tex.2d.v4.u32.f32 {%r3, %r4, %r112, %r113}, [%rd4, {%f1, %f4}]; |
    add.s32 %r114, %r2, 1; |
    cvt.rn.f32.s32 %f2, %r114; |
    tex.2d.v4.u32.f32 {%r115, %r5, %r116, %r117}, [%rd4, {%f1, %f2}]; |
    add.s32 %r118, %r2, -1; |
    cvt.rn.f32.s32 %f3, %r118; |
    tex.2d.v4.u32.f32 {%r119, %r6, %r120, %r121}, [%rd4, {%f1, %f3}]; |
    add.s32 %r122, %r2, -3; |
    cvt.rn.f32.s32 %f5, %r122; |
    tex.2d.v4.u32.f32 {%r7, %r8, %r123, %r124}, [%rd4, {%f1, %f5}]; |
    // Component 0: c0 = param_2 row y-1 (%r9), e0 = param_2 row y+1 (%r10) |
    and.b32 %r9, %r119, 255; |
    and.b32 %r10, %r115, 255; |
    setp.eq.s32 %p5, %r96, 0; |
    @%p5 bra $L__BB2_4; |
|
    // param_11 != 0: spatial-only result, per component |
    //   clamp((5077 * (c + e) - 981 * (row y-3 + row y+3)) >> 13, 0, param_12) |
    // Component 0 -> %r133 |
    add.s32 %r125, %r9, %r10; |
    mul.lo.s32 %r126, %r125, 5077; |
    and.b32 %r127, %r3, 255; |
    and.b32 %r128, %r7, 255; |
    add.s32 %r129, %r128, %r127; |
    mad.lo.s32 %r130, %r129, -981, %r126; |
    shr.s32 %r131, %r130, 13; |
    setp.lt.s32 %p6, %r130, 0; |
    min.s32 %r132, %r131, %r97; |
    selp.b32 %r133, 0, %r132, %p6; |
    // Component 1 -> %r144 |
    and.b32 %r134, %r5, 255; |
    and.b32 %r135, %r6, 255; |
    add.s32 %r136, %r135, %r134; |
    mul.lo.s32 %r137, %r136, 5077; |
    and.b32 %r138, %r4, 255; |
    and.b32 %r139, %r8, 255; |
    add.s32 %r140, %r139, %r138; |
    mad.lo.s32 %r141, %r140, -981, %r137; |
    shr.s32 %r142, %r141, 13; |
    setp.lt.s32 %p7, %r141, 0; |
    min.s32 %r143, %r142, %r97; |
    selp.b32 %r144, 0, %r143, %p7; |
    cvt.u16.u32 %rs1, %r133; |
    cvt.u16.u32 %rs2, %r144; |
    st.global.v2.u8 [%rd1], {%rs1, %rs2}; |
    bra.uni $L__BB2_16; |
|
// param_11 == 0: temporal path. |
$L__BB2_4: |
    // Textures for the y-1 / y+1 temporal neighbours: |
    //   param_9 == param_10 -> %rd8 = param_2, %rd9 = param_3 |
    //   otherwise           -> %rd8 = param_1, %rd9 = param_2 |
    setp.eq.s32 %p8, %r94, %r95; |
    selp.b64 %rd8, %rd4, %rd3, %p8; |
    selp.b64 %rd9, %rd5, %rd4, %p8; |
    // param_1 rows y+4, y+2, y, y-2, y-4 |
    add.s32 %r145, %r2, 4; |
    cvt.rn.f32.s32 %f6, %r145; |
    tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f6}]; |
    add.s32 %r146, %r2, 2; |
    cvt.rn.f32.s32 %f7, %r146; |
    tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f7}]; |
    cvt.rn.f32.s32 %f8, %r2; |
    tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f8}]; |
    add.s32 %r147, %r2, -2; |
    cvt.rn.f32.s32 %f9, %r147; |
    tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd3, {%f1, %f9}]; |
    add.s32 %r148, %r2, -4; |
    cvt.rn.f32.s32 %f10, %r148; |
    tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd3, {%f1, %f10}]; |
    // %rd8 rows y+1, y-1, then %rd9 rows y+1, y-1 |
    tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd8, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r35, %r36, %r37, %r38}, [%rd8, {%f1, %f3}]; |
    tex.2d.v4.u32.f32 {%r39, %r40, %r41, %r42}, [%rd9, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r43, %r44, %r45, %r46}, [%rd9, {%f1, %f3}]; |
    // param_3 rows y+4, y+2, y, y-2, y-4 |
    tex.2d.v4.u32.f32 {%r283, %r284, %r285, %r286}, [%rd5, {%f1, %f6}]; |
    tex.2d.v4.u32.f32 {%r47, %r48, %r49, %r50}, [%rd5, {%f1, %f7}]; |
    tex.2d.v4.u32.f32 {%r51, %r52, %r53, %r54}, [%rd5, {%f1, %f8}]; |
    tex.2d.v4.u32.f32 {%r55, %r56, %r57, %r58}, [%rd5, {%f1, %f9}]; |
    tex.2d.v4.u32.f32 {%r59, %r60, %r61, %r62}, [%rd5, {%f1, %f10}]; |
|
    // ---- Component 0 ---- |
    // d0 = (param_1 row y + param_3 row y) >> 1 (%r280); %r65 = |their difference| |
    and.b32 %r149, %r19, 255; |
    and.b32 %r150, %r51, 255; |
    add.s32 %r63, %r150, %r149; |
    shr.u32 %r280, %r63, 1; |
    sub.s32 %r151, %r149, %r150; |
    abs.s32 %r65, %r151; |
    // %r159 = (|%rd8 row y-1 - c0| + |%rd8 row y+1 - e0|) >> 1 |
    and.b32 %r152, %r35, 255; |
    sub.s32 %r153, %r152, %r9; |
    abs.s32 %r154, %r153; |
    and.b32 %r155, %r31, 255; |
    sub.s32 %r156, %r155, %r10; |
    abs.s32 %r157, %r156; |
    add.s32 %r158, %r154, %r157; |
    shr.u32 %r159, %r158, 1; |
    // %r167 = the same measure taken against %rd9 |
    and.b32 %r160, %r43, 255; |
    sub.s32 %r161, %r160, %r9; |
    abs.s32 %r162, %r161; |
    and.b32 %r163, %r39, 255; |
    sub.s32 %r164, %r163, %r10; |
    abs.s32 %r165, %r164; |
    add.s32 %r166, %r162, %r165; |
    shr.u32 %r167, %r166, 1; |
    // %r66 = max(%r65 >> 1, %r159, %r167); when it is zero d0 is kept as is |
    shr.u32 %r168, %r65, 1; |
    max.u32 %r169, %r168, %r159; |
    max.s32 %r66, %r169, %r167; |
    setp.eq.s32 %p9, %r66, 0; |
    @%p9 bra $L__BB2_9; |
|
    // b = ((param_1 + param_3 row y-2) >> 1) - c0   (%r173) |
    // f = ((param_1 + param_3 row y+2) >> 1) - e0   (%r177) |
    and.b32 %r170, %r23, 255; |
    and.b32 %r171, %r55, 255; |
    add.s32 %r67, %r171, %r170; |
    shr.u32 %r172, %r67, 1; |
    sub.s32 %r173, %r172, %r9; |
    and.b32 %r174, %r15, 255; |
    and.b32 %r175, %r47, 255; |
    add.s32 %r68, %r175, %r174; |
    shr.u32 %r176, %r68, 1; |
    sub.s32 %r177, %r176, %r10; |
    // %r69 = max(%r66, min(d0-e0, d0-c0, max(b, f)), -max(d0-e0, d0-c0, min(b, f))) |
    min.s32 %r178, %r173, %r177; |
    sub.s32 %r179, %r280, %r10; |
    sub.s32 %r180, %r280, %r9; |
    max.s32 %r181, %r179, %r180; |
    max.s32 %r182, %r181, %r178; |
    max.s32 %r183, %r173, %r177; |
    min.s32 %r184, %r179, %r180; |
    min.s32 %r185, %r184, %r183; |
    neg.s32 %r186, %r182; |
    max.s32 %r187, %r66, %r185; |
    max.s32 %r69, %r187, %r186; |
    // |c0 - e0| > %r65 selects $L__BB2_7, otherwise $L__BB2_6. |
    // %r70 = c0 + e0, %r71 = param_2 row y-3 + row y+3. |
    sub.s32 %r188, %r9, %r10; |
    abs.s32 %r189, %r188; |
    setp.gt.s32 %p10, %r189, %r65; |
    add.s32 %r70, %r9, %r10; |
    and.b32 %r190, %r3, 255; |
    and.b32 %r191, %r7, 255; |
    add.s32 %r71, %r191, %r190; |
    @%p10 bra $L__BB2_7; |
    bra.uni $L__BB2_6; |
|
// %r279 = ((5570 * S0 - 3801 * S2 + 1016 * S4) >> 2) + 4309 * %r70 - 213 * %r71 |
//   S0 = param_1 + param_3 at row y |
//   S2 = param_1 + param_3 at rows y-2 and y+2 |
//   S4 = param_1 + param_3 at rows y-4 and y+4 |
$L__BB2_7: |
    mul.lo.s32 %r193, %r63, 5570; |
    add.s32 %r194, %r67, %r68; |
    mad.lo.s32 %r195, %r194, -3801, %r193; |
    and.b32 %r196, %r11, 255; |
    and.b32 %r197, %r27, 255; |
    add.s32 %r198, %r197, %r196; |
    and.b32 %r199, %r59, 255; |
    add.s32 %r200, %r198, %r199; |
    // param_3 row y+4 completes S4 |
    and.b32 %r287, %r283, 255; |
    add.s32 %r201, %r200, %r287; |
    mad.lo.s32 %r202, %r201, 1016, %r195; |
    shr.s32 %r203, %r202, 2; |
    mul.lo.s32 %r204, %r70, 4309; |
    mad.lo.s32 %r205, %r71, -213, %r204; |
    add.s32 %r279, %r205, %r203; |
    bra.uni $L__BB2_8; |
|
// %r279 = 5077 * %r70 - 981 * %r71 |
$L__BB2_6: |
    mul.lo.s32 %r192, %r70, 5077; |
    mad.lo.s32 %r279, %r71, -981, %r192; |
|
// Limit (%r279 >> 13) to [d0 - %r69, d0 + %r69], then clamp to [0, param_12]. |
$L__BB2_8: |
    add.s32 %r206, %r69, %r280; |
    shr.s32 %r207, %r279, 13; |
    setp.gt.s32 %p11, %r207, %r206; |
    sub.s32 %r208, %r280, %r69; |
    max.s32 %r209, %r207, %r208; |
    selp.b32 %r210, %r206, %r209, %p11; |
    setp.lt.s32 %p12, %r210, 0; |
    min.s32 %r211, %r210, %r97; |
    selp.b32 %r280, 0, %r211, %p12; |
|
// ---- Component 1 (component 0 result is final in %r280) ---- |
$L__BB2_9: |
    // d1 = (param_1 row y + param_3 row y) >> 1 (%r282); %r79 = |their difference| |
    and.b32 %r212, %r52, 255; |
    and.b32 %r213, %r20, 255; |
    add.s32 %r77, %r212, %r213; |
    shr.u32 %r282, %r77, 1; |
    sub.s32 %r214, %r213, %r212; |
    abs.s32 %r79, %r214; |
    // c1 = param_2 row y-1 (%r80), e1 = param_2 row y+1 (%r81) |
    // %r222 = (|%rd8 row y-1 - c1| + |%rd8 row y+1 - e1|) >> 1 |
    and.b32 %r215, %r36, 255; |
    and.b32 %r80, %r6, 255; |
    sub.s32 %r216, %r215, %r80; |
    abs.s32 %r217, %r216; |
    and.b32 %r218, %r32, 255; |
    and.b32 %r81, %r5, 255; |
    sub.s32 %r219, %r218, %r81; |
    abs.s32 %r220, %r219; |
    add.s32 %r221, %r217, %r220; |
    shr.u32 %r222, %r221, 1; |
    // %r230 = the same measure taken against %rd9 |
    and.b32 %r223, %r44, 255; |
    sub.s32 %r224, %r223, %r80; |
    abs.s32 %r225, %r224; |
    and.b32 %r226, %r40, 255; |
    sub.s32 %r227, %r226, %r81; |
    abs.s32 %r228, %r227; |
    add.s32 %r229, %r225, %r228; |
    shr.u32 %r230, %r229, 1; |
    // %r82 = max(%r79 >> 1, %r222, %r230); when it is zero d1 is stored as is |
    shr.u32 %r231, %r79, 1; |
    max.u32 %r232, %r231, %r222; |
    max.s32 %r82, %r232, %r230; |
    setp.eq.s32 %p13, %r82, 0; |
    @%p13 bra $L__BB2_14; |
|
    // b = ((param_1 + param_3 row y-2) >> 1) - c1   (%r236) |
    // f = ((param_1 + param_3 row y+2) >> 1) - e1   (%r240) |
    and.b32 %r233, %r24, 255; |
    and.b32 %r234, %r56, 255; |
    add.s32 %r83, %r234, %r233; |
    shr.u32 %r235, %r83, 1; |
    sub.s32 %r236, %r235, %r80; |
    and.b32 %r237, %r48, 255; |
    and.b32 %r238, %r16, 255; |
    add.s32 %r84, %r237, %r238; |
    shr.u32 %r239, %r84, 1; |
    sub.s32 %r240, %r239, %r81; |
    // %r85 = max(%r82, min(d1-e1, d1-c1, max(b, f)), -max(d1-e1, d1-c1, min(b, f))) |
    min.s32 %r241, %r236, %r240; |
    sub.s32 %r242, %r282, %r81; |
    sub.s32 %r243, %r282, %r80; |
    max.s32 %r244, %r242, %r243; |
    max.s32 %r245, %r244, %r241; |
    max.s32 %r246, %r236, %r240; |
    min.s32 %r247, %r242, %r243; |
    min.s32 %r248, %r247, %r246; |
    neg.s32 %r249, %r245; |
    max.s32 %r250, %r82, %r248; |
    max.s32 %r85, %r250, %r249; |
    // |c1 - e1| > %r79 selects $L__BB2_12, otherwise $L__BB2_11. |
    // %r86 = c1 + e1, %r87 = param_2 row y-3 + row y+3. |
    sub.s32 %r251, %r80, %r81; |
    abs.s32 %r252, %r251; |
    setp.gt.s32 %p14, %r252, %r79; |
    add.s32 %r86, %r80, %r81; |
    and.b32 %r253, %r4, 255; |
    and.b32 %r254, %r8, 255; |
    add.s32 %r87, %r254, %r253; |
    @%p14 bra $L__BB2_12; |
    bra.uni $L__BB2_11; |
|
// %r281 = ((5570 * S0 - 3801 * S2 + 1016 * S4) >> 2) + 4309 * %r86 - 213 * %r87 |
$L__BB2_12: |
    mul.lo.s32 %r256, %r77, 5570; |
    add.s32 %r257, %r83, %r84; |
    mad.lo.s32 %r258, %r257, -3801, %r256; |
    and.b32 %r259, %r12, 255; |
    and.b32 %r260, %r28, 255; |
    add.s32 %r261, %r260, %r259; |
    and.b32 %r262, %r60, 255; |
    add.s32 %r263, %r261, %r262; |
    // param_3 row y+4 completes S4 |
    and.b32 %r288, %r284, 255; |
    add.s32 %r264, %r263, %r288; |
    mad.lo.s32 %r265, %r264, 1016, %r258; |
    shr.s32 %r266, %r265, 2; |
    mul.lo.s32 %r267, %r86, 4309; |
    mad.lo.s32 %r268, %r87, -213, %r267; |
    add.s32 %r281, %r268, %r266; |
    bra.uni $L__BB2_13; |
|
// %r281 = 5077 * %r86 - 981 * %r87 |
$L__BB2_11: |
    mul.lo.s32 %r255, %r86, 5077; |
    mad.lo.s32 %r281, %r87, -981, %r255; |
|
// Limit (%r281 >> 13) to [d1 - %r85, d1 + %r85], then clamp to [0, param_12]. |
$L__BB2_13: |
    add.s32 %r269, %r85, %r282; |
    shr.s32 %r270, %r281, 13; |
    setp.gt.s32 %p15, %r270, %r269; |
    sub.s32 %r271, %r282, %r85; |
    max.s32 %r272, %r270, %r271; |
    selp.b32 %r273, %r269, %r272, %p15; |
    setp.lt.s32 %p16, %r273, 0; |
    min.s32 %r274, %r273, %r97; |
    selp.b32 %r282, 0, %r274, %p16; |
|
// Store {component 0, component 1}. |
$L__BB2_14: |
    cvt.u16.u32 %rs3, %r282; |
    cvt.u16.u32 %rs4, %r280; |
    st.global.v2.u8 [%rd1], {%rs4, %rs3}; |
|
$L__BB2_16: |
    ret; |
|
} |
|
|
|
// bwdif_ushort2: each thread produces one two-component 16-bit output element |
// at (x, y). Component 0 is taken from the first tex result register, |
// component 1 from the second; both are filtered with the same arithmetic. |
// |
// Parameters, as used by the code below: |
//   param_0     output buffer (generic address, converted to global) |
//   param_1..3  texture handles sampled with tex.2d; presumably the previous, |
//               current and next frame -- confirm against the host code |
//   param_4, 5  exclusive upper bounds for x and y |
//   param_6     output row stride, in four-byte elements |
//   param_7, 8  not read by this kernel |
//   param_9     value of y % 2 for rows copied unchanged from param_2 |
//   param_10    compared with param_9 to pick the textures that supply the |
//               y-1 / y+1 temporal neighbours |
//   param_11    non-zero selects the spatial-only path |
//   param_12    upper clamp of each filtered value (lower clamp is 0) |
// |
// The immediates 5077/981, 4309/213 and 5570/3801/1016 equal the little-endian |
// words stored in coef_sp, coef_lf and coef_hf. |
// |
// The 1016-weighted term sums rows y-4 and y+4 of BOTH param_1 and param_3; |
// the param_3 row y+4 texel lives in %r283 / %r284 (masked into %r287 / %r288). |
.visible .entry bwdif_ushort2( |
    .param .u64 bwdif_ushort2_param_0, |
    .param .u64 bwdif_ushort2_param_1, |
    .param .u64 bwdif_ushort2_param_2, |
    .param .u64 bwdif_ushort2_param_3, |
    .param .u32 bwdif_ushort2_param_4, |
    .param .u32 bwdif_ushort2_param_5, |
    .param .u32 bwdif_ushort2_param_6, |
    .param .u32 bwdif_ushort2_param_7, |
    .param .u32 bwdif_ushort2_param_8, |
    .param .u32 bwdif_ushort2_param_9, |
    .param .u32 bwdif_ushort2_param_10, |
    .param .u32 bwdif_ushort2_param_11, |
    .param .u32 bwdif_ushort2_param_12 |
) |
{ |
    .reg .pred %p<17>; |
    .reg .b16 %rs<7>; |
    .reg .f32 %f<12>; |
    .reg .b32 %r<289>; |
    .reg .b64 %rd<10>; |
|
    ld.param.u64 %rd2, [bwdif_ushort2_param_0]; |
    ld.param.u64 %rd3, [bwdif_ushort2_param_1]; |
    ld.param.u64 %rd4, [bwdif_ushort2_param_2]; |
    ld.param.u64 %rd5, [bwdif_ushort2_param_3]; |
    ld.param.u32 %r98, [bwdif_ushort2_param_4]; |
    ld.param.u32 %r99, [bwdif_ushort2_param_5]; |
    ld.param.u32 %r93, [bwdif_ushort2_param_6]; |
    ld.param.u32 %r94, [bwdif_ushort2_param_9]; |
    ld.param.u32 %r95, [bwdif_ushort2_param_10]; |
    ld.param.u32 %r96, [bwdif_ushort2_param_11]; |
    ld.param.u32 %r97, [bwdif_ushort2_param_12]; |
|
    // x = ctaid.x * ntid.x + tid.x  (%r1);  y likewise from the .y registers (%r2) |
    mov.u32 %r100, %ctaid.x; |
    mov.u32 %r101, %ntid.x; |
    mov.u32 %r102, %tid.x; |
    mad.lo.s32 %r1, %r100, %r101, %r102; |
    mov.u32 %r103, %ntid.y; |
    mov.u32 %r104, %ctaid.y; |
    mov.u32 %r105, %tid.y; |
    mad.lo.s32 %r2, %r104, %r103, %r105; |
|
    // Threads with x >= param_4 or y >= param_5 exit without writing. |
    setp.ge.s32 %p1, %r1, %r98; |
    setp.ge.s32 %p2, %r2, %r99; |
    or.pred %p3, %p1, %p2; |
    @%p3 bra $L__BB3_16; |
|
    cvta.to.global.u64 %rd6, %rd2; |
    // %r109 = y % 2 (signed remainder), compared with param_9 |
    shr.u32 %r106, %r2, 31; |
    add.s32 %r107, %r2, %r106; |
    and.b32 %r108, %r107, -2; |
    sub.s32 %r109, %r2, %r108; |
    setp.eq.s32 %p4, %r109, %r94; |
    // %rd1 = output base + (y * param_6 + x) * 4 bytes |
    mad.lo.s32 %r110, %r2, %r93, %r1; |
    mul.wide.s32 %rd7, %r110, 4; |
    add.s64 %rd1, %rd6, %rd7; |
    cvt.rn.f32.s32 %f1, %r1; |
    @%p4 bra $L__BB3_15; |
    bra.uni $L__BB3_2; |
|
// y % 2 == param_9: store both components of the param_2 texel unchanged. |
$L__BB3_15: |
    cvt.rn.f32.s32 %f11, %r2; |
    tex.2d.v4.u32.f32 {%r275, %r276, %r277, %r278}, [%rd4, {%f1, %f11}]; |
    cvt.u16.u32 %rs5, %r276; |
    cvt.u16.u32 %rs6, %r275; |
    st.global.v2.u16 [%rd1], {%rs6, %rs5}; |
    bra.uni $L__BB3_16; |
|
// Row has to be synthesised: fetch param_2 rows y+3, y+1, y-1, y-3. |
$L__BB3_2: |
    add.s32 %r111, %r2, 3; |
    cvt.rn.f32.s32 %f4, %r111; |
    tex.2d.v4.u32.f32 {%r3, %r4, %r112, %r113}, [%rd4, {%f1, %f4}]; |
    add.s32 %r114, %r2, 1; |
    cvt.rn.f32.s32 %f2, %r114; |
    tex.2d.v4.u32.f32 {%r115, %r5, %r116, %r117}, [%rd4, {%f1, %f2}]; |
    add.s32 %r118, %r2, -1; |
    cvt.rn.f32.s32 %f3, %r118; |
    tex.2d.v4.u32.f32 {%r119, %r6, %r120, %r121}, [%rd4, {%f1, %f3}]; |
    add.s32 %r122, %r2, -3; |
    cvt.rn.f32.s32 %f5, %r122; |
    tex.2d.v4.u32.f32 {%r7, %r8, %r123, %r124}, [%rd4, {%f1, %f5}]; |
    // Component 0: c0 = param_2 row y-1 (%r9), e0 = param_2 row y+1 (%r10) |
    and.b32 %r9, %r119, 65535; |
    and.b32 %r10, %r115, 65535; |
    setp.eq.s32 %p5, %r96, 0; |
    @%p5 bra $L__BB3_4; |
|
    // param_11 != 0: spatial-only result, per component |
    //   clamp((5077 * (c + e) - 981 * (row y-3 + row y+3)) >> 13, 0, param_12) |
    // Component 0 -> %r133 |
    add.s32 %r125, %r9, %r10; |
    mul.lo.s32 %r126, %r125, 5077; |
    and.b32 %r127, %r3, 65535; |
    and.b32 %r128, %r7, 65535; |
    add.s32 %r129, %r128, %r127; |
    mad.lo.s32 %r130, %r129, -981, %r126; |
    shr.s32 %r131, %r130, 13; |
    setp.lt.s32 %p6, %r130, 0; |
    min.s32 %r132, %r131, %r97; |
    selp.b32 %r133, 0, %r132, %p6; |
    // Component 1 -> %r144 |
    and.b32 %r134, %r5, 65535; |
    and.b32 %r135, %r6, 65535; |
    add.s32 %r136, %r135, %r134; |
    mul.lo.s32 %r137, %r136, 5077; |
    and.b32 %r138, %r4, 65535; |
    and.b32 %r139, %r8, 65535; |
    add.s32 %r140, %r139, %r138; |
    mad.lo.s32 %r141, %r140, -981, %r137; |
    shr.s32 %r142, %r141, 13; |
    setp.lt.s32 %p7, %r141, 0; |
    min.s32 %r143, %r142, %r97; |
    selp.b32 %r144, 0, %r143, %p7; |
    cvt.u16.u32 %rs1, %r133; |
    cvt.u16.u32 %rs2, %r144; |
    st.global.v2.u16 [%rd1], {%rs1, %rs2}; |
    bra.uni $L__BB3_16; |
|
// param_11 == 0: temporal path. |
$L__BB3_4: |
    // Textures for the y-1 / y+1 temporal neighbours: |
    //   param_9 == param_10 -> %rd8 = param_2, %rd9 = param_3 |
    //   otherwise           -> %rd8 = param_1, %rd9 = param_2 |
    setp.eq.s32 %p8, %r94, %r95; |
    selp.b64 %rd8, %rd4, %rd3, %p8; |
    selp.b64 %rd9, %rd5, %rd4, %p8; |
    // param_1 rows y+4, y+2, y, y-2, y-4 |
    add.s32 %r145, %r2, 4; |
    cvt.rn.f32.s32 %f6, %r145; |
    tex.2d.v4.u32.f32 {%r11, %r12, %r13, %r14}, [%rd3, {%f1, %f6}]; |
    add.s32 %r146, %r2, 2; |
    cvt.rn.f32.s32 %f7, %r146; |
    tex.2d.v4.u32.f32 {%r15, %r16, %r17, %r18}, [%rd3, {%f1, %f7}]; |
    cvt.rn.f32.s32 %f8, %r2; |
    tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [%rd3, {%f1, %f8}]; |
    add.s32 %r147, %r2, -2; |
    cvt.rn.f32.s32 %f9, %r147; |
    tex.2d.v4.u32.f32 {%r23, %r24, %r25, %r26}, [%rd3, {%f1, %f9}]; |
    add.s32 %r148, %r2, -4; |
    cvt.rn.f32.s32 %f10, %r148; |
    tex.2d.v4.u32.f32 {%r27, %r28, %r29, %r30}, [%rd3, {%f1, %f10}]; |
    // %rd8 rows y+1, y-1, then %rd9 rows y+1, y-1 |
    tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [%rd8, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r35, %r36, %r37, %r38}, [%rd8, {%f1, %f3}]; |
    tex.2d.v4.u32.f32 {%r39, %r40, %r41, %r42}, [%rd9, {%f1, %f2}]; |
    tex.2d.v4.u32.f32 {%r43, %r44, %r45, %r46}, [%rd9, {%f1, %f3}]; |
    // param_3 rows y+4, y+2, y, y-2, y-4 |
    tex.2d.v4.u32.f32 {%r283, %r284, %r285, %r286}, [%rd5, {%f1, %f6}]; |
    tex.2d.v4.u32.f32 {%r47, %r48, %r49, %r50}, [%rd5, {%f1, %f7}]; |
    tex.2d.v4.u32.f32 {%r51, %r52, %r53, %r54}, [%rd5, {%f1, %f8}]; |
    tex.2d.v4.u32.f32 {%r55, %r56, %r57, %r58}, [%rd5, {%f1, %f9}]; |
    tex.2d.v4.u32.f32 {%r59, %r60, %r61, %r62}, [%rd5, {%f1, %f10}]; |
|
    // ---- Component 0 ---- |
    // d0 = (param_1 row y + param_3 row y) >> 1 (%r280); %r65 = |their difference| |
    and.b32 %r149, %r19, 65535; |
    and.b32 %r150, %r51, 65535; |
    add.s32 %r63, %r150, %r149; |
    shr.u32 %r280, %r63, 1; |
    sub.s32 %r151, %r149, %r150; |
    abs.s32 %r65, %r151; |
    // %r159 = (|%rd8 row y-1 - c0| + |%rd8 row y+1 - e0|) >> 1 |
    and.b32 %r152, %r35, 65535; |
    sub.s32 %r153, %r152, %r9; |
    abs.s32 %r154, %r153; |
    and.b32 %r155, %r31, 65535; |
    sub.s32 %r156, %r155, %r10; |
    abs.s32 %r157, %r156; |
    add.s32 %r158, %r154, %r157; |
    shr.u32 %r159, %r158, 1; |
    // %r167 = the same measure taken against %rd9 |
    and.b32 %r160, %r43, 65535; |
    sub.s32 %r161, %r160, %r9; |
    abs.s32 %r162, %r161; |
    and.b32 %r163, %r39, 65535; |
    sub.s32 %r164, %r163, %r10; |
    abs.s32 %r165, %r164; |
    add.s32 %r166, %r162, %r165; |
    shr.u32 %r167, %r166, 1; |
    // %r66 = max(%r65 >> 1, %r159, %r167); when it is zero d0 is kept as is |
    shr.u32 %r168, %r65, 1; |
    max.u32 %r169, %r168, %r159; |
    max.s32 %r66, %r169, %r167; |
    setp.eq.s32 %p9, %r66, 0; |
    @%p9 bra $L__BB3_9; |
|
    // b = ((param_1 + param_3 row y-2) >> 1) - c0   (%r173) |
    // f = ((param_1 + param_3 row y+2) >> 1) - e0   (%r177) |
    and.b32 %r170, %r23, 65535; |
    and.b32 %r171, %r55, 65535; |
    add.s32 %r67, %r171, %r170; |
    shr.u32 %r172, %r67, 1; |
    sub.s32 %r173, %r172, %r9; |
    and.b32 %r174, %r15, 65535; |
    and.b32 %r175, %r47, 65535; |
    add.s32 %r68, %r175, %r174; |
    shr.u32 %r176, %r68, 1; |
    sub.s32 %r177, %r176, %r10; |
    // %r69 = max(%r66, min(d0-e0, d0-c0, max(b, f)), -max(d0-e0, d0-c0, min(b, f))) |
    min.s32 %r178, %r173, %r177; |
    sub.s32 %r179, %r280, %r10; |
    sub.s32 %r180, %r280, %r9; |
    max.s32 %r181, %r179, %r180; |
    max.s32 %r182, %r181, %r178; |
    max.s32 %r183, %r173, %r177; |
    min.s32 %r184, %r179, %r180; |
    min.s32 %r185, %r184, %r183; |
    neg.s32 %r186, %r182; |
    max.s32 %r187, %r66, %r185; |
    max.s32 %r69, %r187, %r186; |
    // |c0 - e0| > %r65 selects $L__BB3_7, otherwise $L__BB3_6. |
    // %r70 = c0 + e0, %r71 = param_2 row y-3 + row y+3. |
    sub.s32 %r188, %r9, %r10; |
    abs.s32 %r189, %r188; |
    setp.gt.s32 %p10, %r189, %r65; |
    add.s32 %r70, %r9, %r10; |
    and.b32 %r190, %r3, 65535; |
    and.b32 %r191, %r7, 65535; |
    add.s32 %r71, %r191, %r190; |
    @%p10 bra $L__BB3_7; |
    bra.uni $L__BB3_6; |
|
// %r279 = ((5570 * S0 - 3801 * S2 + 1016 * S4) >> 2) + 4309 * %r70 - 213 * %r71 |
//   S0 = param_1 + param_3 at row y |
//   S2 = param_1 + param_3 at rows y-2 and y+2 |
//   S4 = param_1 + param_3 at rows y-4 and y+4 |
$L__BB3_7: |
    mul.lo.s32 %r193, %r63, 5570; |
    add.s32 %r194, %r67, %r68; |
    mad.lo.s32 %r195, %r194, -3801, %r193; |
    and.b32 %r196, %r11, 65535; |
    and.b32 %r197, %r27, 65535; |
    add.s32 %r198, %r197, %r196; |
    and.b32 %r199, %r59, 65535; |
    add.s32 %r200, %r198, %r199; |
    // param_3 row y+4 completes S4 |
    and.b32 %r287, %r283, 65535; |
    add.s32 %r201, %r200, %r287; |
    mad.lo.s32 %r202, %r201, 1016, %r195; |
    shr.s32 %r203, %r202, 2; |
    mul.lo.s32 %r204, %r70, 4309; |
    mad.lo.s32 %r205, %r71, -213, %r204; |
    add.s32 %r279, %r205, %r203; |
    bra.uni $L__BB3_8; |
|
// %r279 = 5077 * %r70 - 981 * %r71 |
$L__BB3_6: |
    mul.lo.s32 %r192, %r70, 5077; |
    mad.lo.s32 %r279, %r71, -981, %r192; |
|
// Limit (%r279 >> 13) to [d0 - %r69, d0 + %r69], then clamp to [0, param_12]. |
$L__BB3_8: |
    add.s32 %r206, %r69, %r280; |
    shr.s32 %r207, %r279, 13; |
    setp.gt.s32 %p11, %r207, %r206; |
    sub.s32 %r208, %r280, %r69; |
    max.s32 %r209, %r207, %r208; |
    selp.b32 %r210, %r206, %r209, %p11; |
    setp.lt.s32 %p12, %r210, 0; |
    min.s32 %r211, %r210, %r97; |
    selp.b32 %r280, 0, %r211, %p12; |
|
// ---- Component 1 (component 0 result is final in %r280) ---- |
$L__BB3_9: |
    // d1 = (param_1 row y + param_3 row y) >> 1 (%r282); %r79 = |their difference| |
    and.b32 %r212, %r52, 65535; |
    and.b32 %r213, %r20, 65535; |
    add.s32 %r77, %r212, %r213; |
    shr.u32 %r282, %r77, 1; |
    sub.s32 %r214, %r213, %r212; |
    abs.s32 %r79, %r214; |
    // c1 = param_2 row y-1 (%r80), e1 = param_2 row y+1 (%r81) |
    // %r222 = (|%rd8 row y-1 - c1| + |%rd8 row y+1 - e1|) >> 1 |
    and.b32 %r215, %r36, 65535; |
    and.b32 %r80, %r6, 65535; |
    sub.s32 %r216, %r215, %r80; |
    abs.s32 %r217, %r216; |
    and.b32 %r218, %r32, 65535; |
    and.b32 %r81, %r5, 65535; |
    sub.s32 %r219, %r218, %r81; |
    abs.s32 %r220, %r219; |
    add.s32 %r221, %r217, %r220; |
    shr.u32 %r222, %r221, 1; |
    // %r230 = the same measure taken against %rd9 |
    and.b32 %r223, %r44, 65535; |
    sub.s32 %r224, %r223, %r80; |
    abs.s32 %r225, %r224; |
    and.b32 %r226, %r40, 65535; |
    sub.s32 %r227, %r226, %r81; |
    abs.s32 %r228, %r227; |
    add.s32 %r229, %r225, %r228; |
    shr.u32 %r230, %r229, 1; |
    // %r82 = max(%r79 >> 1, %r222, %r230); when it is zero d1 is stored as is |
    shr.u32 %r231, %r79, 1; |
    max.u32 %r232, %r231, %r222; |
    max.s32 %r82, %r232, %r230; |
    setp.eq.s32 %p13, %r82, 0; |
    @%p13 bra $L__BB3_14; |
|
    // b = ((param_1 + param_3 row y-2) >> 1) - c1   (%r236) |
    // f = ((param_1 + param_3 row y+2) >> 1) - e1   (%r240) |
    and.b32 %r233, %r24, 65535; |
    and.b32 %r234, %r56, 65535; |
    add.s32 %r83, %r234, %r233; |
    shr.u32 %r235, %r83, 1; |
    sub.s32 %r236, %r235, %r80; |
    and.b32 %r237, %r48, 65535; |
    and.b32 %r238, %r16, 65535; |
    add.s32 %r84, %r237, %r238; |
    shr.u32 %r239, %r84, 1; |
    sub.s32 %r240, %r239, %r81; |
    // %r85 = max(%r82, min(d1-e1, d1-c1, max(b, f)), -max(d1-e1, d1-c1, min(b, f))) |
    min.s32 %r241, %r236, %r240; |
    sub.s32 %r242, %r282, %r81; |
    sub.s32 %r243, %r282, %r80; |
    max.s32 %r244, %r242, %r243; |
    max.s32 %r245, %r244, %r241; |
    max.s32 %r246, %r236, %r240; |
    min.s32 %r247, %r242, %r243; |
    min.s32 %r248, %r247, %r246; |
    neg.s32 %r249, %r245; |
    max.s32 %r250, %r82, %r248; |
    max.s32 %r85, %r250, %r249; |
    // |c1 - e1| > %r79 selects $L__BB3_12, otherwise $L__BB3_11. |
    // %r86 = c1 + e1, %r87 = param_2 row y-3 + row y+3. |
    sub.s32 %r251, %r80, %r81; |
    abs.s32 %r252, %r251; |
    setp.gt.s32 %p14, %r252, %r79; |
    add.s32 %r86, %r80, %r81; |
    and.b32 %r253, %r4, 65535; |
    and.b32 %r254, %r8, 65535; |
    add.s32 %r87, %r254, %r253; |
    @%p14 bra $L__BB3_12; |
    bra.uni $L__BB3_11; |
|
// %r281 = ((5570 * S0 - 3801 * S2 + 1016 * S4) >> 2) + 4309 * %r86 - 213 * %r87 |
$L__BB3_12: |
    mul.lo.s32 %r256, %r77, 5570; |
    add.s32 %r257, %r83, %r84; |
    mad.lo.s32 %r258, %r257, -3801, %r256; |
    and.b32 %r259, %r12, 65535; |
    and.b32 %r260, %r28, 65535; |
    add.s32 %r261, %r260, %r259; |
    and.b32 %r262, %r60, 65535; |
    add.s32 %r263, %r261, %r262; |
    // param_3 row y+4 completes S4 |
    and.b32 %r288, %r284, 65535; |
    add.s32 %r264, %r263, %r288; |
    mad.lo.s32 %r265, %r264, 1016, %r258; |
    shr.s32 %r266, %r265, 2; |
    mul.lo.s32 %r267, %r86, 4309; |
    mad.lo.s32 %r268, %r87, -213, %r267; |
    add.s32 %r281, %r268, %r266; |
    bra.uni $L__BB3_13; |
|
// %r281 = 5077 * %r86 - 981 * %r87 |
$L__BB3_11: |
    mul.lo.s32 %r255, %r86, 5077; |
    mad.lo.s32 %r281, %r87, -981, %r255; |
|
// Limit (%r281 >> 13) to [d1 - %r85, d1 + %r85], then clamp to [0, param_12]. |
$L__BB3_13: |
    add.s32 %r269, %r85, %r282; |
    shr.s32 %r270, %r281, 13; |
    setp.gt.s32 %p15, %r270, %r269; |
    sub.s32 %r271, %r282, %r85; |
    max.s32 %r272, %r270, %r271; |
    selp.b32 %r273, %r269, %r272, %p15; |
    setp.lt.s32 %p16, %r273, 0; |
    min.s32 %r274, %r273, %r97; |
    selp.b32 %r282, 0, %r274, %p16; |
|
// Store {component 0, component 1}. |
$L__BB3_14: |
    cvt.u16.u32 %rs3, %r282; |
    cvt.u16.u32 %rs4, %r280; |
    st.global.v2.u16 [%rd1], {%rs4, %rs3}; |
|
$L__BB3_16: |
    ret; |
|
} |
|
|
|
|