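; VP9 high-bitdepth (10/12 bpp) loop filter SIMD, SSE2/SSSE3/AVX versions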
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511:   times 16 dw   511
pw_2047:  times 16 dw  2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw  -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text

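; SCRATCH dst, spare, mem[, name]: free up register m<dst>. On x86-64 its
; contents are swapped into the unused register m<spare>; on x86-32, where
; only 8 xmm registers exist, they are spilled to [mem]. The optional name
; defines reg_<name> as an alias for wherever the value now lives.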
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

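; UNSCRATCH dst, spare, mem[, name]: undo a SCRATCH, restoring the value
; into m<dst> and dropping the reg_<name> alias if one was defined.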
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

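; PRELOAD reg, mem[, name]: on x86-64, load [mem] into the spare register
; m<reg> and let reg_<name> refer to it; on x86-32 the load is skipped and
; reg_<name> stays a memory operand.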
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

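; calculate one half (p or q side) of the flat8out mask: with p0/q0 in m0
; and the four outermost pixels in m4-m7, m7 ends up holding the combined
; |px - p0| > F (or |qx - q0| > F) comparisons, where reg_F is the
; bit-depth-scaled flatness threshold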
%macro FLAT8OUT_HALF 0
    psubw               m4, m0
    psubw               m5, m0
    psubw               m6, m0
    psubw               m7, m0
    ABS2                m4, m5, m2, m3
    ABS2                m6, m7, m2, m3
    pcmpgtw             m4, reg_F
    pcmpgtw             m5, reg_F
    pcmpgtw             m6, reg_F
    pcmpgtw             m7, reg_F
    por                 m5, m4
    por                 m7, m6
    por                 m7, m5
%endmacro

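; calculate one half of the masks that only need the four innermost pixels
; (m0-m3 = p0-p3 or q0-q3): m2 collects the edge-filter violations against
; threshold I, m7 the high-edge-variance (hev) comparison against H and,
; for wd > 4, m4 the inverse flat8in comparisons against F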
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw               m4, m3, m0          ; q3/p3 - q0/p0
    psubw               m5, m2, m0          ; q2/p2 - q0/p0
    ABS2                m4, m5, m6, m7
    pcmpgtw             m4, reg_F
    pcmpgtw             m5, reg_F
%endif
    psubw               m3, m2              ; q3/p3 - q2/p2
    psubw               m2, m1              ; q2/p2 - q1/p1
    ABS2                m3, m2, m6, m7
    pcmpgtw             m3, reg_I
    pcmpgtw             m2, reg_I
%if %1 > 4
    por                 m4, m5
%endif
    por                 m2, m3
    psubw               m3, m1, m0          ; q1/p1 - q0/p0
    ABS1                m3, m5
%if %1 > 4
    pcmpgtw             m6, m3, reg_F
%endif
    pcmpgtw             m7, m3, reg_H       ; hev condition
    pcmpgtw             m3, reg_I
%if %1 > 4
    por                 m4, m6
%endif
    por                 m2, m3
%endmacro

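; FILTER_STEP tmp, acc, mask, shift, dst, px[, sub, add1, add2[, inplace]]:
; one tap of the flat8/flat16 smoothing filter. The filtered value is
; acc >> shift; only lanes selected by reg_<mask> are changed. When
; sub/add1/add2 are given, the running sum is advanced for the next tap
; (acc -= px + sub; acc += add1 + add2). With inplace == 1 the result is
; written back into the px register instead of being stored to [dst].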
%macro FILTER_STEP 6-10 "", "", "", 0
    psrlw               %1, %2, %4          ; filtered pixel
    psubw               %1, %6              ; delta to the current pixel
%ifnidn %7, ""
    psubw               %2, %6              ; advance the running sum
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3          ; apply the lane mask to the delta
%if %10 == 1
    paddw               %6, %1              ; update the register in place
%else
    paddw               %1, %6
    mova              [%5], %1              ; store the filtered pixel
%endif
%endmacro

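; LOOP_FILTER %1 (dir: h/v), %2 (wd: 4/8/16), %3 (bpp: 10/12)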
%macro LOOP_FILTER 3

; register and stack budget: %%num_xmm_regs/%%num_gpr_regs are handed to
; cglobal; %%bak_mem slots back up SCRATCHed registers (x86-32 only),
; %%wd_mem slots hold filter temporaries ([rsp+0..n*mmsize]) and
; %%tsp_mem is the transpose scratch area for the horizontal filter
%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; wd == 4
%assign %%num_xmm_regs 14
%endif
%assign %%bak_mem 0
%else
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; wd == 4
%assign %%bak_mem 5
%endif
%endif

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else
%assign %%num_gpr_regs 5
%endif
%assign %%wd_mem 6
%else
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else
%assign %%wd_mem 0
%endif
%endif

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16
%assign %%tsp_mem 16
%else
%assign %%tsp_mem 8
%endif

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)

; clipping constants for the given bit depth
%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif

cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H

    ; scale the E/I/H thresholds to the bit depth and splat each across
    ; a full register (pw_256 as pshufb control broadcasts word 0)
    shl                 Ed, %3-8
    shl                 Id, %3-8
    shl                 Hd, %3-8
%if cpuflag(ssse3)
    mova                m0, [pw_256]
%endif
    movd                m1, Ed
    movd                m2, Id
    movd                m3, Hd
%if cpuflag(ssse3)
    pshufb              m1, m0
    pshufb              m2, m0
    pshufb              m3, m0
%else
    punpcklwd           m1, m1
    punpcklwd           m2, m2
    punpcklwd           m3, m3
    pshufd              m1, m1, q0000
    pshufd              m2, m2, q0000
    pshufd              m3, m3, q0000
%endif
    SCRATCH              1,  8, rsp+(%%off+0)*mmsize, E
    SCRATCH              2,  9, rsp+(%%off+1)*mmsize, I
    SCRATCH              3, 10, rsp+(%%off+2)*mmsize, H
%if %2 > 4
    PRELOAD             11, pw_ %+ %%maxf, F
%endif

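; compute the %%p/%%q row addresses; for the horizontal filter the lines
; are first transposed into the stack scratch area so the filter can work
; on columns as if they were rows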
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea           stride3q, [strideq*3]
    neg            strideq
%if %2 == 16
    lea              dst0q, [dst8q+strideq*8]
%else
    lea              dst4q, [dst8q+strideq*4]
%endif
    neg            strideq
%if %2 == 16
    lea             dst12q, [dst8q+strideq*4]
    lea              dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea           stride3q, [strideq*3]
    lea              dst4q, [dst0q+strideq*4]

%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize

%if %2 < 16
    ; wd == 4/8: transpose one 8x8 block around the edge
    movu                m0, [dst0q+strideq*0-8]
    movu                m1, [dst0q+strideq*1-8]
    movu                m2, [dst0q+strideq*2-8]
    movu                m3, [dst0q+stride3q -8]
    movu                m4, [dst4q+strideq*0-8]
    movu                m5, [dst4q+strideq*1-8]
    movu                m6, [dst4q+strideq*2-8]
    movu                m7, [dst4q+stride3q -8]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif

    mova             [%%p3], m0
    mova             [%%p2], m1
    mova             [%%p1], m2
    mova             [%%p0], m3
%if ARCH_X86_64
    mova             [%%q0], m4
%endif
    mova             [%%q1], m5
    mova             [%%q2], m6
    mova             [%%q3], m7

%else
    ; wd == 16: transpose two 8x8 blocks on each side of the edge
%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize

    mova                m0, [dst0q+strideq*0-16]
    mova                m1, [dst0q+strideq*1-16]
    mova                m2, [dst0q+strideq*2-16]
    mova                m3, [dst0q+stride3q -16]
    mova                m4, [dst4q+strideq*0-16]
    mova                m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2-16]
%endif
    mova                m7, [dst4q+stride3q -16]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif

    mova             [%%p7], m0
    mova             [%%p6], m1
    mova             [%%p5], m2
    mova             [%%p4], m3
%if ARCH_X86_64
    mova             [%%p3], m4
%endif
    mova             [%%p2], m5
    mova             [%%p1], m6
    mova             [%%p0], m7

    mova                m0, [dst0q+strideq*0]
    mova                m1, [dst0q+strideq*1]
    mova                m2, [dst0q+strideq*2]
    mova                m3, [dst0q+stride3q ]
    mova                m4, [dst4q+strideq*0]
    mova                m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2]
%endif
    mova                m7, [dst4q+stride3q ]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif

    mova             [%%q0], m0
    mova             [%%q1], m1
    mova             [%%q2], m2
    mova             [%%q3], m3
%if ARCH_X86_64
    mova             [%%q4], m4
%endif
    mova             [%%q5], m5
    mova             [%%q6], m6
    mova             [%%q7], m7

%endif
%endif

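; build the filter masks, q side first: fm gates the edge filter as a
; whole, hev marks high-edge-variance lanes (which skip the p1/q1
; adjustment), and for wd >= 8 flat8in/flat8out select the flat filters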
    mova                m0, [%%q0]
%if %2 == 16
    mova                m4, [%%q4]
    mova                m5, [%%q5]
    mova                m6, [%%q6]
    mova                m7, [%%q7]

    ; q half of the flat8out mask
    FLAT8OUT_HALF
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    mova                m1, [%%q1]
    mova                m2, [%%q2]
    mova                m3, [%%q3]

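; q half of the fm and hev masks (plus flat8in for wd > 4)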
    FLAT8IN_HALF        %2
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

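; the E-threshold part of the fm mask: 2 * |p0 - q0| + |p1 - q1| / 2 > E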
    mova                m3, [%%p0]
    mova                m4, [%%p1]

    psubw               m5, m3, m0          ; p0 - q0
    psubw               m6, m4, m1          ; p1 - q1
%if ARCH_X86_64
    ABS2                m5, m6, m7, m12
%else
    ABS1                m5, m7
    ABS1                m6, m7
%endif
    paddw               m5, m5              ; 2 * |p0 - q0|
    psraw               m6, 1               ; |p1 - q1| / 2
    paddw               m6, m5
    pcmpgtw             m6, reg_E
    por                 m2, m6
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM

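; switch to the p side: p0/p1 move into m0/m1, then the same half-mask
; calculations run on p7-p4 (wd == 16) and p3-p0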
    SWAP                 3, 0
    SWAP                 4, 1
%if %2 == 16
    mova                m7, [%%p7]
    mova                m6, [%%p6]
    mova                m5, [%%p5]
    mova                m4, [%%p4]

    ; p half of the flat8out mask, merged with the q half
    FLAT8OUT_HALF
    por                 m7, reg_F8O
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

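; p half of the fm/hev (and flat8in) masks, then combine both halves into
; the final lane selectors: F4M picks lanes where fm passes but flat8in
; does not (4-tap filter), F8M lanes where flat8in passes (7-tap filter,
; minus flat8out lanes for wd == 16) and F16M lanes where all conditions
; pass (15-tap filter); the pw_m1 xors turn violation masks into selectors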
    mova                m2, [%%p2]
    mova                m3, [%%p3]

    FLAT8IN_HALF        %2
    por                 m7, reg_HEV         ; combined hev mask
%if %2 > 4
    por                 m4, reg_F8I         ; combined flat8in violations
%endif
    por                 m2, reg_FM          ; combined fm violations
%if %2 > 4
    por                 m4, m2              ; flat8 violations include fm ones
%if %2 == 16
    por                 m5, m4, reg_F8O     ; flat16 violations
    pandn               m2, m4              ; F4M: fm ok, not flat8
    pandn               m4, m5              ; F8M: flat8 ok, not flat8out
    pxor                m5, [pw_m1]         ; F16M: fully flat
    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn               m2, m4              ; F4M: fm ok, not flat8
    pxor                m4, [pw_m1]         ; F8M: flat8 ok
%endif
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor                m2, [pw_m1]         ; F4M: fm ok
%endif
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M

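; 15-tap flat16 filter (wd == 16 lanes flagged in F16M): a sliding sum of
; p7..q7 produces each output as (sum + 8) >> 4, starting from
; p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4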
%if %2 > 4
%if %2 == 16
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m6, [%%p5]
    mova                m7, [%%p4]
    PRELOAD              8, %%p3, P3
    PRELOAD              9, %%p2, P2
%endif
    PRELOAD             10, %%q0, Q0
    PRELOAD             11, %%q1, Q1
%if %2 == 16
    ; initial sum: 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8,
    ; built as (8*p7 + p5 + p3 + p1 + q0 + 8) + (2*p6 + p4 + p2 + p0 - p7)
    psllw               m4, m2, 3
    paddw               m5, m3, m3
    paddw               m4, m6
    paddw               m5, m7
    paddw               m4, reg_P3
    paddw               m5, reg_P2
    paddw               m4, m1
    paddw               m5, m0
    paddw               m4, reg_Q0
    psubw               m5, m2
    paddw               m4, [pw_8]
    paddw               m5, m4

    ; filter p6-q6, keeping copies of the original pixels in
    ; [rsp+0..5*mmsize] so the sliding sum can subtract them later
    mova   [rsp+0*mmsize], m3
    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
%endif
    mova                m3, [%%q2]
%if %2 == 16
    mova   [rsp+1*mmsize], m6
    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
%endif
    mova                m6, [%%q3]
%if %2 == 16
    mova   [rsp+2*mmsize], m7
    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
    mova                m7, [%%q4]
%if ARCH_X86_64
    mova   [rsp+3*mmsize], reg_P3
%else
    mova                m4, reg_P3
    mova   [rsp+3*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
    PRELOAD              8, %%q5, Q5
%if ARCH_X86_64
    mova   [rsp+4*mmsize], reg_P2
%else
    mova                m4, reg_P2
    mova   [rsp+4*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
    PRELOAD              9, %%q6, Q6
    mova   [rsp+5*mmsize], m1
    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
    mova                m1, [%%q7]
    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6

    mova                m7, [%%p1]
%else
    SWAP                 1, 7                ; p1 into m7
%endif

    mova                m2, [%%p3]
    mova                m1, [%%p2]

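; 7-tap flat8 filter (wd >= 8 lanes flagged in F8M): the same sliding-sum
; scheme with (sum + 4) >> 3, starting from
; p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3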
    ; initial sum: (4*p3 + p1 + p0 + 4) + (2*p2 + q0 - p3)
    psllw               m4, m2, 2
    paddw               m5, m1, m1
    paddw               m4, m7
    psubw               m5, m2
    paddw               m4, m0
    paddw               m5, reg_Q0
    paddw               m4, [pw_4]
    paddw               m5, m4

    ; keep copies of p2/p1 for the sliding sum
%if ARCH_X86_64
    mova                m8, m1
    mova                m9, m7
%else
    mova   [rsp+0*mmsize], m1
    mova   [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
%else
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
%else
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH            2, 10, %%q0
    UNSCRATCH            6, 11, %%q1
%else
    SWAP                 1, 7                ; p1 into m7
    mova                m2, [%%q0]
    mova                m6, [%%q1]
%endif
    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV

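; 4-tap filter (lanes flagged in F4M): with f = 3 * (q0 - p0) plus the
; hev-masked clip(p1 - q1), take f1 = clip(f + 4) >> 3 and
; f2 = clip(f + 3) >> 3, then q0 -= f1 and p0 += f2; non-hev lanes also
; get p1 += f3 and q1 -= f3 with f3 = (f1 + 1) >> 1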
    psubw               m4, m7, m6          ; p1 - q1
    psubw               m5, m2, m0          ; q0 - p0
    pand                m4, m3              ; & hev
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]
    paddw               m4, m5
    paddw               m5, m5
    paddw               m4, m5              ; f = clip(p1 - q1) + 3 * (q0 - p0)
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]
    pand                m4, reg_F4M
    paddw               m5, m4, [pw_4]
    paddw               m4, [pw_3]
    pminsw              m5, [pw_ %+ %%maxsgn]
    pminsw              m4, [pw_ %+ %%maxsgn]
    psraw               m5, 3               ; f1 = clip(f + 4) >> 3
    psraw               m4, 3               ; f2 = clip(f + 3) >> 3
    psubw               m2, m5              ; q0 -= f1
    paddw               m0, m4              ; p0 += f2
    pandn               m3, m5              ; f1 & ~hev
    pxor                m4, m4
    mova                m5, [pw_ %+ %%maxusgn]
    pmaxsw              m2, m4
    pmaxsw              m0, m4
    pminsw              m2, m5              ; clamp q0/p0 to the pixel range
    pminsw              m0, m5
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_16384]      ; f3 = (f1 + 1) >> 1
%else
    paddw               m3, [pw_1]
    psraw               m3, 1
%endif
    paddw               m7, m3              ; p1 += f3 (non-hev lanes only)
    psubw               m6, m3              ; q1 -= f3 (non-hev lanes only)
    pmaxsw              m7, m4
    pmaxsw              m6, m4
    pminsw              m7, m5              ; clamp p1/q1 to the pixel range
    pminsw              m6, m5

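; store the filtered pixels; the horizontal filter transposes the changed
; columns back into place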
%ifidn %1, v
    mova             [%%p1], m7
    mova             [%%p0], m0
    mova             [%%q0], m2
    mova             [%%q1], m6
%else
%if %2 == 4
    TRANSPOSE4x4W        7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q -4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q -4], m6
%elif %2 == 8
    mova                m3, [%%p3]
    mova                m4, [%%q2]
    mova                m5, [%%q3]

%if ARCH_X86_64
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova                m2, [%%q0]
%endif

    movu   [dst0q+strideq*0-8], m3
    movu   [dst0q+strideq*1-8], m1
    movu   [dst0q+strideq*2-8], m7
    movu   [dst0q+stride3q -8], m0
    movu   [dst4q+strideq*0-8], m2
    movu   [dst4q+strideq*1-8], m6
    movu   [dst4q+strideq*2-8], m4
    movu   [dst4q+stride3q -8], m5
%else
    ; wd == 16: transpose and store both 8x8 halves
    SCRATCH              2, 8, %%q0
    SCRATCH              6, 9, %%q1
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m4, [%%p5]
    mova                m5, [%%p4]
    mova                m6, [%%p3]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova             [%%p1], m7
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif

    mova   [dst0q+strideq*0-16], m2
    mova   [dst0q+strideq*1-16], m3
    mova   [dst0q+strideq*2-16], m4
    mova   [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova   [dst4q+strideq*0-16], m6
%endif
    mova   [dst4q+strideq*1-16], m1
    mova   [dst4q+strideq*2-16], m7
    mova   [dst4q+stride3q -16], m0

    UNSCRATCH            2, 8, %%q0
    UNSCRATCH            6, 9, %%q1
    mova                m0, [%%q2]
    mova                m1, [%%q3]
    mova                m3, [%%q4]
    mova                m4, [%%q5]
%if ARCH_X86_64
    mova                m5, [%%q6]
%endif
    mova                m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    mova   [dst0q+strideq*0], m2
    mova   [dst0q+strideq*1], m6
    mova   [dst0q+strideq*2], m0
    mova   [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova   [dst4q+strideq*0], m3
%endif
    mova   [dst4q+strideq*1], m4
    mova   [dst4q+strideq*2], m5
    mova   [dst4q+stride3q ], m7
%endif
%endif
    RET
%endmacro

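; instantiate sse2, ssse3 and avx versions of each filter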
%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1,  4, %2
LOOP_FILTER_CPUSETS %1,  8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12