|
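;*****************************************************************************
;* x86-optimized functions for the gblur filter
;*****************************************************************************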
%include "libavutil/x86/x86util.asm" |
|
|
|
SECTION .data |
|
|
|
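; Shuffle constants for the in-register transposes below: indices1 swaps
; 128-bit pairs, indices2 adjacent qwords, indices3 adjacent dwords; the six
; words of ..._mask become the k6..k1 merge masks (see INIT_WORD_MASK and
; INIT_INDICES).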
gblur_transpose_16x16_indices1: dq 2, 3, 0, 1, 6, 7, 4, 5 |
|
gblur_transpose_16x16_indices2: dq 1, 0, 3, 2, 5, 4, 7, 6 |
|
gblur_transpose_16x16_indices3: dd 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 |
|
gblur_transpose_16x16_mask: dw 0xcc, 0x33, 0xaa, 0x55, 0xaaaa, 0x5555 |
|
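; Lane offsets 0..15; multiplied by the row width to build the column
; gather/scatter vindex.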
gblur_vindex_width: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
|
|
|
SECTION .text |
|
|
|
%xdefine AVX2_MMSIZE 32 |
|
%xdefine AVX512_MMSIZE 64 |
|
|
|
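; Sign-extend each listed 32-bit argument register to 64 bits where the two
; differ.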
%macro MOVSXDIFNIDN 1-* |
|
%rep %0 |
|
movsxdifnidn %1q, %1d |
|
%rotate 1 |
|
%endrep |
|
%endmacro |
|
|
|
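; KXNOR ymm, kreg[, gpr]
; Produce an all-ones gather/scatter mask: kxnorw on AVX-512; on AVX2 either
; set the GPR (three-argument form) or the ymm register to all ones.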
%macro KXNOR 2-3
|
%if mmsize == AVX512_MMSIZE |
|
kxnorw %2, %2, %2 |
|
%else |
|
%if %0 == 3 |
|
mov %3, -1 |
|
%else |
|
vpcmpeqd %1, %1, %1 |
|
%endif |
|
%endif |
|
%endmacro |
|
|
|
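; KMOVW a, b[, c, d]
; Copy a mask: with four arguments, AVX2 moves b into a (ymm) while AVX-512
; moves d into c (kreg); with two arguments, only AVX-512 performs the kmovw.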
%macro KMOVW 2-4 |
|
%if mmsize == AVX2_MMSIZE && %0 == 4 |
|
mova %1, %2 |
|
%elif mmsize == AVX512_MMSIZE |
|
%if %0 == 4 |
|
%rotate 2 |
|
%endif |
|
kmovw %1, %2 |
|
%endif |
|
%endmacro |
|
|
|
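; PUSH_MASK ymm, kreg, mask32, tmp_gpr, stack_addr
; Materialize a bit mask: AVX2 expands each low bit of %3 into a full dword
; on the stack and loads the vector into %1 (destroying %3); AVX-512 simply
; moves the bits into the k-register %2.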
%macro PUSH_MASK 5 |
|
%if mmsize == AVX2_MMSIZE |
|
%assign %%n mmsize/4 |
|
%assign %%i 0 |
|
%rep %%n |
|
mov %4, %3 |
|
and %4, 1 |
|
neg %4 |
|
mov dword [%5 + %%i*4], %4 |
|
sar %3, 1 |
|
%assign %%i %%i+1 |
|
%endrep |
|
movu %1, [%5] |
|
%else |
|
kmovd %2, %3 |
|
%endif |
|
%endmacro |
|
|
|
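; VMASKMOVPS dst, src, ymm_mask, kmask
; Masked 32-bit load/store: vpmaskmovd with the vector mask on AVX2, a
; k-masked vmovups on AVX-512 (clobbers k7).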
%macro VMASKMOVPS 4 |
|
%if mmsize == AVX2_MMSIZE |
|
vpmaskmovd %1, %3, %2 |
|
%else |
|
kmovw k7, %4 |
|
vmovups %1{k7}, %2 |
|
%endif |
|
%endmacro |
|
|
|
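; VGATHERDPS dst, vm32, ymm_mask, kmask
; Gather packed floats; AVX2 consumes the ymm mask, AVX-512 the k-register.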
%macro VGATHERDPS 4 |
|
%if mmsize == AVX2_MMSIZE |
|
vgatherdps %1, %2, %3 |
|
%else |
|
vgatherdps %1{%4}, %2 |
|
%endif |
|
%endmacro |
|
|
|
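; VSCATTERDPS128 reg_n, base, offset, scale, stride, mask, tmp
; Emulated scatter of the four dwords of xm%1: store a lane, rotate the
; register and advance the offset for each set bit of the (consumed) mask,
; stopping at the first clear bit.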
%macro VSCATTERDPS128 7 |
|
%rep 4 |
|
mov %7, %6 |
|
and %7, 1 |
|
cmp %7, 0 |
|
je %%end_scatter |
|
movss [%2 + %3*%4], xm%1 |
|
vpshufd m%1, m%1, 0x39 |
|
add %3, %5 |
|
sar %6, 1 |
|
%endrep |
|
%%end_scatter: |
|
%endmacro |
|
|
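; VSCATTERDPS256 reg_n, base, offset, scale, stride, mask, tmp
; Emulated 8-lane scatter for AVX2: scatter the low xmm half of m%1, then the
; high half via xm15. Clobbers m15 and zeroes the offset register on entry.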
%macro VSCATTERDPS256 7 |
|
mova m15, m%1 |
|
xor %3, %3 |
|
VSCATTERDPS128 15, %2, %3, %4, %5, %6, %7 |
|
vextractf128 xm15, m%1, 1 |
|
VSCATTERDPS128 15, %2, %3, %4, %5, %6, %7 |
|
%endmacro |
|
|
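; VSCATTERDPS base, offset, vindex, mask, kmask, reg_n, stride, tmp[, mask_copy]
; Scatter m%6 to base + vindex*4: a native vscatterdps on AVX-512, the
; emulation above on AVX2. The optional ninth argument receives a scratch
; copy of the mask so the original survives the emulation.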
%macro VSCATTERDPS 8-9
|
%if mmsize == AVX2_MMSIZE |
|
%if %0 == 9 |
|
mov %9, %4 |
|
VSCATTERDPS256 %6, %1, %2, 4, %7, %9, %8 |
|
%else |
|
VSCATTERDPS256 %6, %1, %2, 4, %7, %4, %8 |
|
%endif |
|
%else |
|
vscatterdps [%1 + %3*4]{%5}, m%6 |
|
%endif |
|
%endmacro |
|
|
|
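; Load successive 16-bit masks from gblur_transpose_16x16_mask into the named
; k-registers.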
%macro INIT_WORD_MASK 1-* |
|
%assign %%i 0 |
|
%rep %0 |
|
kmovw %1, [gblur_transpose_16x16_mask + %%i * 2] |
|
%assign %%i %%i+1 |
|
%rotate 1 |
|
%endrep |
|
%endmacro |
|
|
|
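; Load gblur_transpose_16x16_indices1..3 into the named vector registers.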
%macro INIT_INDICES 1-* |
|
%assign %%i 1 |
|
%rep %0 |
|
movu %1, [gblur_transpose_16x16_indices %+ %%i] |
|
%assign %%i %%i+1 |
|
%rotate 1 |
|
%endrep |
|
%endmacro |
|
|
|
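; Spill/reload a vector register to the local stack frame on AVX2 (no-ops on
; AVX-512, which has registers to spare). The offset is tracked at assembly
; time, so pushes and pops must pair up statically.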
%assign stack_offset 0 |
|
%macro PUSH_MM 1 |
|
%if mmsize == AVX2_MMSIZE |
|
movu [rsp + stack_offset], %1 |
|
%assign stack_offset stack_offset+mmsize |
|
%endif |
|
%endmacro |
|
|
|
%macro POP_MM 1 |
|
%if mmsize == AVX2_MMSIZE |
|
%assign stack_offset stack_offset-mmsize |
|
movu %1, [rsp + stack_offset] |
|
%endif |
|
%endmacro |
|
|
|
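; READ_LOCAL_BUFFER n
; Backward recurrence over a staged block of n columns:
; y[k] = buf[k] + nu*y[k+1], starting from the carry in m3 and filling
; m19..m4 (AVX-512) or m9..m2 (AVX2). On AVX-512 the new carry (the leftmost
; column) is copied back into m3; on AVX2 it is transferred around the
; transpose with PUSH_MM/POP_MM instead.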
%macro READ_LOCAL_BUFFER 1 |
|
%if mmsize == AVX512_MMSIZE |
|
%assign %%i 19 |
|
%else |
|
%assign %%i 9 |
|
%endif |
|
%assign %%j %%i-1 |
|
%assign %%k %1-1 |
|
%xdefine %%m m %+ %%i |
|
mova %%m, m3 |
|
FMULADD_PS %%m, %%m, m0, [localbufq + %%k * mmsize], %%m |
|
%assign %%k %%k-1 |
|
%rep %1-1 |
|
%xdefine %%m m %+ %%j |
|
mova %%m, m %+ %%i |
|
FMULADD_PS %%m, %%m, m0, [localbufq + %%k * mmsize], %%m |
|
%assign %%i %%i-1 |
|
%assign %%j %%j-1 |
|
%assign %%k %%k-1 |
|
%endrep |
|
%if mmsize == AVX512_MMSIZE |
|
mova m3, m %+ %%i |
|
%endif |
|
%endmacro |
|
|
|
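; %1 = %1*%2 + %3, then store %1 to %4.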
%macro FMADD_WRITE 4 |
|
FMULADD_PS %1, %1, %2, %3, %1 |
|
mova %4, %1 |
|
%endmacro |
|
|
|
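; Forward recurrence over one transposed block: for each listed register
; (one transposed column) m3 = m3*nu + column, stored to consecutive
; local-buffer slots.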
%macro WRITE_LOCAL_BUFFER_INTERNAL 8-16 |
|
%assign %%i 0 |
|
%rep %0 |
|
FMADD_WRITE m3, m0, m %+ %1, [localbufq + %%i * mmsize] |
|
%assign %%i %%i+1 |
|
%rotate 1 |
|
%endrep |
|
%endmacro |
|
|
|
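; GATHERPS n
; Load n rows of one block column into m4.. (AVX-512) or m2.. (AVX2), using
; widthq as the row stride in floats. Clobbers strideq.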
%macro GATHERPS 1 |
|
%if mmsize == AVX512_MMSIZE |
|
%assign %%i 4 |
|
%else |
|
%assign %%i 2 |
|
%endif |
|
movu m %+ %%i, [ptrq] |
|
mov strideq, widthq |
|
%assign %%i %%i+1 |
|
%rep %1-2 |
|
movu m %+ %%i, [ptrq + strideq*4] |
|
add strideq, widthq |
|
%assign %%i %%i+1 |
|
%endrep |
|
movu m %+ %%i, [ptrq + strideq*4] |
|
%endmacro |
|
|
|
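; Store the listed registers to consecutive rows (the mirror of GATHERPS).
; Clobbers strideq.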
%macro SCATTERPS_INTERNAL 8-16 |
|
movu [ptrq + strideq*0], m %+ %1 |
|
mov strideq, widthq |
|
%rotate 1 |
|
%rep %0-2 |
|
movu [ptrq + strideq*4], m %+ %1 |
|
add strideq, widthq |
|
%rotate 1 |
|
%endrep |
|
movu [ptrq + strideq*4], m %+ %1 |
|
%endmacro |
|
|
|
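; BATCH_INSERT64X4 imm8, dst,src,ysrc, ...
; Repeated vinserti64x4 with a shared imm8 over (dst, src, ysrc) triples.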
%macro BATCH_INSERT64X4 4-* |
|
%assign %%imm8 %1 |
|
%rotate 1 |
|
%rep (%0-1)/3 |
|
vinserti64x4 m%1, m%2, ym%3, %%imm8 |
|
%rotate 3 |
|
%endrep |
|
%endmacro |
|
|
|
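; For each (a, b) pair, combine 256-bit half imm8 of a (low half of the
; result) with half imm8 of b (high half) into a.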
%macro BATCH_EXTRACT_INSERT 2-* |
|
%assign %%imm8 %1 |
|
%rotate 1 |
|
%rep (%0-1)/2 |
|
vextractf64x4 ym%1, m%1, %%imm8 |
|
vextractf64x4 ym%2, m%2, %%imm8 |
|
vinserti64x4 m%1, m%1, ym%2, %%imm8 |
|
%rotate 2 |
|
%endrep |
|
%endmacro |
|
|
|
%macro BATCH_MOVE 2-* |
|
%rep %0/2 |
|
mova m%1, m%2 |
|
%rotate 2 |
|
%endrep |
|
%endmacro |
|
|
|
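; BATCH_PERMUTE q|d, kmask, idx, dst,src, ...
; Masked vpermq/vpermd: merge lanes of each src, permuted by m<idx>, into
; dst under kmask.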
%macro BATCH_PERMUTE 3-* |
|
%xdefine %%decorator %1 |
|
%xdefine %%mask %2 |
|
%assign %%index %3 |
|
%rotate 3 |
|
%rep (%0-3)/2 |
|
vperm %+ %%decorator m%1{%%mask}, m %+ %%index, m%2 |
|
%rotate 2 |
|
%endrep |
|
%endmacro |
|
|
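; In-register 16x16 float transpose (AVX-512). Input rows arrive in m4..m19;
; the network works down through 256-, 128-, 64- and 32-bit granularity using
; the masks (k1..k6) and index vectors (m28..m30) prepared by
; INIT_WORD_MASK/INIT_INDICES. The transposed rows land in the register order
; consumed by WRITE_LOCAL_BUFFER/SCATTERPS.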
%macro TRANSPOSE_16X16_AVX512 0 |
|
BATCH_INSERT64X4 0x1, 20,4,12, 21,5,13, 22,6,14, 23,7,15 |
|
BATCH_INSERT64X4 0x1, 24,8,16, 25,9,17, 26,10,18, 27,11,19 |
|
|
|
BATCH_EXTRACT_INSERT 0x1, 4,12, 5,13, 6,14, 7,15 |
|
BATCH_EXTRACT_INSERT 0x1, 8,16, 9,17, 10,18, 11,19 |
|
|
|
BATCH_MOVE 12,20, 13,21, 14,22, 15,23 |
|
BATCH_PERMUTE q, k6, 28, 12,24, 13,25, 14,26, 15,27 |
|
BATCH_PERMUTE q, k5, 28, 24,20, 25,21, 26,22, 27,23 |
|
|
|
BATCH_MOVE 16,4, 17,5, 18,6, 19,7 |
|
BATCH_PERMUTE q, k6, 28, 16,8, 17,9, 18,10, 19,11 |
|
BATCH_PERMUTE q, k5, 28, 8,4, 9,5, 10,6, 11,7 |
|
|
|
BATCH_MOVE 4,12, 5,13, 6,24, 7,25 |
|
BATCH_MOVE 20,16, 21,17, 22,8, 23,9 |
|
|
|
BATCH_PERMUTE q, k4, 29, 4,14, 5,15, 6,26, 7,27 |
|
BATCH_PERMUTE q, k3, 29, 14,12, 15,13, 26,24, 27,25 |
|
BATCH_PERMUTE q, k4, 29, 20,18, 21,19, 22,10, 23,11 |
|
BATCH_PERMUTE q, k3, 29, 18,16, 19,17, 10,8, 11,9 |
|
|
|
BATCH_MOVE 8,4, 9,14, 16,6, 17,26 |
|
BATCH_MOVE 24,20, 25,18, 12,22, 13,10 |
|
|
|
BATCH_PERMUTE d, k2, 30, 8,5, 9,15, 16,7, 17,27 |
|
BATCH_PERMUTE d, k1, 30, 5,4, 15,14, 7,6, 27,26 |
|
BATCH_PERMUTE d, k2, 30, 24,21, 25,19, 12,23, 13,11 |
|
BATCH_PERMUTE d, k1, 30, 21,20, 19,18, 23,22, 11,10 |
|
%endmacro |
|
|
|
%macro INSERT_UNPACK 8 |
|
vinsertf128 m%5, m%1, xm%3, 0x1 |
|
vinsertf128 m%6, m%2, xm%4, 0x1 |
|
vunpcklpd m%7, m%5, m%6 |
|
vunpckhpd m%8, m%5, m%6 |
|
%endmacro |
|
|
|
%macro SHUFFLE 4 |
|
vshufps m%3, m%1, m%2, 0x88 |
|
vshufps m%4, m%1, m%2, 0xDD |
|
mova m%1, m%3 |
|
mova m%2, m%4 |
|
%endmacro |
|
|
|
%macro EXTRACT_INSERT_UNPACK 6 |
|
vextractf128 xm%1, m%1, 0x1 |
|
vextractf128 xm%2, m%2, 0x1 |
|
vinsertf128 m%3, m%3, xm%1, 0x0 |
|
vinsertf128 m%4, m%4, xm%2, 0x0 |
|
vunpcklpd m%5, m%3, m%4 |
|
vunpckhpd m%6, m%3, m%4 |
|
%endmacro |
|
|
|
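; In-register 8x8 float transpose (AVX2). Input rows arrive in m2..m9, with
; m10..m15 as temporaries; the transposed rows land in the register order
; consumed by WRITE_LOCAL_BUFFER/SCATTERPS (12, 14, 13, 15, 8, 10, 9, 11).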
%macro TRANSPOSE_8X8_AVX2 0 |
|
INSERT_UNPACK 2, 3, 6, 7, 10, 11, 12, 13 |
|
INSERT_UNPACK 4, 5, 8, 9, 10, 11, 14, 15 |
|
|
|
SHUFFLE 12, 14, 10, 11 |
|
SHUFFLE 13, 15, 10, 11 |
|
|
|
EXTRACT_INSERT_UNPACK 4, 5, 8, 9, 10, 11 |
|
EXTRACT_INSERT_UNPACK 2, 3, 6, 7, 8, 9 |
|
|
|
SHUFFLE 8, 10, 6, 7 |
|
SHUFFLE 9, 11, 6, 7 |
|
%endmacro |
|
|
|
%macro TRANSPOSE 0 |
|
%if cpuflag(avx512) |
|
TRANSPOSE_16X16_AVX512 |
|
%elif cpuflag(avx2) |
|
TRANSPOSE_8X8_AVX2 |
|
%endif |
|
%endmacro |
|
|
|
%macro WRITE_LOCAL_BUFFER 0 |
|
%if cpuflag(avx512) |
|
WRITE_LOCAL_BUFFER_INTERNAL 8, 5, 9, 15, 16, 7, 17, 27, \ |
|
24, 21, 25, 19, 12, 23, 13, 11 |
|
%elif cpuflag(avx2) |
|
WRITE_LOCAL_BUFFER_INTERNAL 12, 14, 13, 15, 8, 10, 9, 11 |
|
%endif |
|
%endmacro |
|
|
|
%macro SCATTERPS 0 |
|
%if cpuflag(avx512) |
|
SCATTERPS_INTERNAL 8, 5, 9, 15, 16, 7, 17, 27, \ |
|
24, 21, 25, 19, 12, 23, 13, 11 |
|
%elif cpuflag(avx2) |
|
SCATTERPS_INTERNAL 12, 14, 13, 15, 8, 10, 9, 11 |
|
%endif |
|
%endmacro |
|
|
|
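; Run the middle steps-1 filter iterations entirely inside localbuf: each
; iteration scales the right boundary, runs the backward recurrence, scales
; the left boundary and runs the forward recurrence over the staged block.
; Falls through to %%bscale_scalar when steps == 1.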
%macro OPTIMIZED_LOOP_STEP 0 |
|
lea stepd, [stepsd - 1] |
|
cmp stepd, 0 |
|
jle %%bscale_scalar |
|
%%loop_step: |
|
sub localbufq, mmsize |
|
mulps m3, m1 |
|
movu [localbufq], m3 |
|
|
|
|
|
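; backward recurrence over the staged columns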
lea xq, [widthq - 1] |
|
%%loop_step_x_back: |
|
sub localbufq, mmsize |
|
FMULADD_PS m3, m3, m0, [localbufq], m3 |
|
movu [localbufq], m3 |
|
|
|
dec xq |
|
cmp xq, 0 |
|
jg %%loop_step_x_back |
|
|
|
|
|
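; left boundary reached: scale by bscale, then rerun the forward recurrence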
mulps m3, m1 |
|
movu [localbufq], m3 |
|
add localbufq, mmsize |
|
|
|
lea xq, [widthq - 1] |
|
%%loop_step_x: |
|
FMULADD_PS m3, m3, m0, [localbufq], m3 |
|
movu [localbufq], m3 |
|
add localbufq, mmsize |
|
|
|
dec xq |
|
cmp xq, 0 |
|
jg %%loop_step_x |
|
|
|
dec stepd |
|
cmp stepd, 0 |
|
jg %%loop_step |
|
|
|
%%bscale_scalar: |
|
%endmacro |
|
|
|
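; horiz_slice(buffer, width, height, steps, nu, bscale[, localbuf])
; One horizontal pass of the recursive (IIR) blur. Per step and per row,
; with p = &buffer[y*width]:
;   p[0]   *= bscale;   p[x]   += nu*p[x-1]  (x = 1 .. width-1)
;   p[w-1] *= bscale;   p[x-1] += nu*p[x]    (x = width-1 .. 1)
; The AVX2/AVX-512 builds process 8/16 rows at a time: columns are gathered,
; forward results are staged in localbuf, then replayed backward and
; scattered out. localbuf exists only in those builds.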
%macro HORIZ_SLICE 0 |
|
%if UNIX64 |
|
%if cpuflag(avx512) || cpuflag(avx2) |
|
cglobal horiz_slice, 5, 12, mmnum, 0-mmsize*4, buffer, width, height, steps, \ |
|
localbuf, x, y, step, stride, remain, ptr, mask |
|
%else |
|
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain |
|
%endif |
|
%else |
|
%if cpuflag(avx512) || cpuflag(avx2) |
|
cglobal horiz_slice, 5, 12, mmnum, 0-mmsize*4, buffer, width, height, steps, nu, bscale, \ |
|
localbuf, x, y, step, stride, remain, ptr, mask |
|
%else |
|
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain |
|
%endif |
|
%endif |
|
%if cpuflag(avx512) || cpuflag(avx2) |
|
%assign rows mmsize/4 |
|
%assign cols mmsize/4 |
|
%if WIN64 |
|
VBROADCASTSS m0, num |
|
VBROADCASTSS m1, bscalem |
|
|
|
mov nuq, localbufm |
|
DEFINE_ARGS buffer, width, height, steps, \ |
|
localbuf, x, y, step, stride, remain, ptr, mask |
|
%else |
|
VBROADCASTSS m0, xmm0 |
|
VBROADCASTSS m1, xmm1 |
|
%endif |
|
|
|
MOVSXDIFNIDN width, height, steps |
|
|
|
%if cpuflag(avx512) |
|
vpbroadcastd m2, widthd |
|
INIT_WORD_MASK k6, k5, k4, k3, k2, k1 |
|
INIT_INDICES m28, m29, m30 |
|
%else |
|
movd xm2, widthd |
|
VBROADCASTSS m2, xm2 |
|
%endif |
|
|
|
vpmulld m2, m2, [gblur_vindex_width] |
|
|
|
xor yq, yq |
|
xor xq, xq |
|
|
|
cmp heightq, rows |
|
jl .y_scalar |
|
sub heightq, rows |
|
|
|
.loop_y: |
|
|
|
mov ptrq, yq |
|
imul ptrq, widthq |
|
lea ptrq, [bufferq + ptrq*4] |
|
|
|
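; left boundary: gather the first column of the row block, scale it by
; bscale and stage it in localbuf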
KXNOR m5, k7 |
|
VGATHERDPS m3, [ptrq + m2*4], m5, k7 |
|
mulps m3, m1 |
|
movu [localbufq], m3 |
|
add ptrq, 4 |
|
add localbufq, mmsize |
|
|
|
|
|
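; forward pass: gather blocks of rows, transpose them into columns and run
; the recurrence, staging every result column in localbuf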
PUSH_MM m2 |
|
lea xq, [widthq - 1] |
|
.loop_x: |
|
PUSH_MM m3 |
|
GATHERPS cols |
|
TRANSPOSE |
|
POP_MM m3 |
|
WRITE_LOCAL_BUFFER |
|
|
|
add ptrq, mmsize |
|
add localbufq, rows * mmsize |
|
sub xq, cols |
|
cmp xq, cols |
|
jge .loop_x |
|
POP_MM m2 |
|
|
|
cmp xq, 0 |
|
jle .bscale_scalar |
|
.loop_x_scalar: |
|
KXNOR m5, k7 |
|
VGATHERDPS m4, [ptrq + m2*4], m5, k7 |
|
FMULADD_PS m3, m3, m0, m4, m3 |
|
movu [localbufq], m3 |
|
|
|
add ptrq, 0x4 |
|
add localbufq, mmsize |
|
dec xq |
|
cmp xq, 0 |
|
jg .loop_x_scalar |
|
|
|
.bscale_scalar: |
|
OPTIMIZED_LOOP_STEP |
|
sub ptrq, 4 |
|
sub localbufq, mmsize |
|
mulps m3, m1 |
|
KXNOR m5, k7, maskq |
|
VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq |
|
|
|
|
|
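; backward pass: replay the staged columns in reverse, transpose back and
; scatter the finished pixels to the image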
PUSH_MM m2 |
|
lea xq, [widthq - 1] |
|
.loop_x_back: |
|
sub localbufq, rows * mmsize |
|
READ_LOCAL_BUFFER cols |
|
PUSH_MM m2 |
|
TRANSPOSE |
|
POP_MM m3 |
|
sub ptrq, mmsize |
|
SCATTERPS |
|
|
|
sub xq, cols |
|
cmp xq, cols |
|
jge .loop_x_back |
|
POP_MM m2 |
|
|
|
cmp xq, 0 |
|
jle .end_loop_x |
|
.loop_x_back_scalar: |
|
sub ptrq, 0x4 |
|
sub localbufq, mmsize |
|
FMULADD_PS m3, m3, m0, [localbufq], m3 |
|
KXNOR m5, k7, maskq |
|
VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq |
|
|
|
dec xq |
|
cmp xq, 0 |
|
jg .loop_x_back_scalar |
|
|
|
.end_loop_x: |
|
|
|
add yq, rows |
|
cmp yq, heightq |
|
jle .loop_y |
|
|
|
add heightq, rows |
|
cmp yq, heightq |
|
jge .end_scalar |
|
|
|
mov remainq, widthq |
|
imul remainq, mmsize |
|
add ptrq, remainq |
|
|
|
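; tail rows (height modulo the block size): same structure, but every
; gather/scatter is masked to the remaining row count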
.y_scalar: |
|
mov remainq, heightq |
|
sub remainq, yq |
|
mov maskq, 1 |
|
shlx maskq, maskq, remainq |
|
sub maskq, 1 |
|
mov remainq, maskq |
|
PUSH_MASK m5, k1, remaind, xd, rsp + 0x20 |
|
|
|
mov ptrq, yq |
|
imul ptrq, widthq |
|
lea ptrq, [bufferq + ptrq * 4] |
|
KMOVW m6, m5, k7, k1 |
|
VGATHERDPS m3, [ptrq + m2 * 4], m6, k7 |
|
mulps m3, m1 |
|
movu [localbufq], m3 |
|
add localbufq, mmsize |
|
|
|
|
|
lea xq, [widthq - 1] |
|
.y_scalar_loop_x: |
|
add ptrq, 4 |
|
KMOVW m6, m5, k7, k1 |
|
VGATHERDPS m4, [ptrq + m2 * 4], m6, k7 |
|
FMULADD_PS m3, m3, m0, m4, m3 |
|
movu [localbufq], m3 |
|
add localbufq, mmsize |
|
|
|
dec xq |
|
cmp xq, 0 |
|
jg .y_scalar_loop_x |
|
|
|
OPTIMIZED_LOOP_STEP |
|
|
|
sub localbufq, mmsize |
|
mulps m3, m1 |
|
KMOVW k7, k1 |
|
VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq, heightq |
|
|
|
|
|
lea xq, [widthq - 1] |
|
.y_scalar_loop_x_back: |
|
sub ptrq, 4 |
|
sub localbufq, mmsize |
|
FMULADD_PS m3, m3, m0, [localbufq], m3 |
|
KMOVW k7, k1 |
|
VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq, heightq |
|
dec xq |
|
cmp xq, 0 |
|
jg .y_scalar_loop_x_back |
|
|
|
.end_scalar: |
|
RET |
|
%else |
|
%if WIN64 |
|
movss m0, num |
|
movss m1, bscalem |
|
DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain |
|
%endif |
|
movsxdifnidn widthq, widthd |
|
|
|
mulss m2, m0, m0 |
|
mulss m3, m2, m0 |
|
mulss m4, m3, m0 |
|
xor xq, xq |
|
xor yd, yd |
|
mov strideq, widthq |
|
|
|
shl strideq, 2 |
|
|
|
mov remainq, widthq |
|
sub remainq, 1 |
|
and remainq, 3 |
|
sub widthq, remainq |
|
|
|
shufps m0, m0, 0 |
|
shufps m2, m2, 0 |
|
shufps m3, m3, 0 |
|
shufps m4, m4, 0 |
|
|
|
.loop_y: |
|
xor stepd, stepd |
|
|
|
.loop_step: |
|
|
|
mulss m5, m1, [ptrq + xq * 4] |
|
movss [ptrq + xq * 4], m5 |
|
inc xq |
|
|
|
|
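; Forward pass, four samples per iteration. The scalar recurrence
;   y[x] = p[x] + nu*y[x-1]
; is unrolled so one block needs only the carry y[x-1] (low lane of m5):
;   y[x+i] = p[x+i] + nu*p[x+i-1] + ... + nu^(i+1)*y[x-1]
; with nu, nu^2, nu^3, nu^4 broadcast in m0, m2, m3, m4.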
.loop_x: |
|
movu m6, [ptrq + xq * 4] |
|
pslldq m7, m6, 4 |
|
movss m7, m5 |
|
FMULADD_PS m6, m7, m0, m6, m8 |
|
pslldq m7, 4 |
|
FMULADD_PS m6, m7, m2, m6, m8 |
|
pslldq m7, 4 |
|
FMULADD_PS m6, m7, m3, m6, m8 |
|
pslldq m7, 4 |
|
FMULADD_PS m6, m7, m4, m6, m8 |
|
movu [ptrq + xq * 4], m6 |
|
shufps m5, m6, m6, q3333 |
|
add xq, 4 |
|
cmp xq, widthq |
|
jl .loop_x |
|
|
|
add widthq, remainq |
|
cmp xq, widthq |
|
jge .end_scalar |
|
|
|
.loop_scalar: |
|
|
|
movss m5, [ptrq + 4*xq - 4] |
|
mulss m5, m0 |
|
addss m5, [ptrq + 4*xq] |
|
movss [ptrq + 4*xq], m5 |
|
inc xq |
|
cmp xq, widthq |
|
jl .loop_scalar |
|
.end_scalar: |
|
|
|
dec xq |
|
mulss m5, m1, [ptrq + 4*xq] |
|
movss [ptrq + 4*xq], m5 |
|
shufps m5, m5, 0 |
|
|
|
|
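; Backward pass, the mirror image of .loop_x: m7 shifts right and the carry
; (broadcast in m5) is blended into the top lane.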
.loop_x_back: |
|
sub xq, 4 |
|
movu m6, [ptrq + xq * 4] |
|
psrldq m7, m6, 4 |
|
blendps m7, m5, 0x8 |
|
FMULADD_PS m6, m7, m0, m6, m8 |
|
psrldq m7, 4 |
|
FMULADD_PS m6, m7, m2, m6, m8 |
|
psrldq m7, 4 |
|
FMULADD_PS m6, m7, m3, m6, m8 |
|
psrldq m7, 4 |
|
FMULADD_PS m6, m7, m4, m6, m8 |
|
movu [ptrq + xq * 4], m6 |
|
shufps m5, m6, m6, 0 |
|
cmp xq, remainq |
|
jg .loop_x_back |
|
|
|
cmp xq, 0 |
|
jle .end_scalar_back |
|
|
|
.loop_scalar_back: |
|
|
|
movss m5, [ptrq + 4*xq] |
|
mulss m5, m0 |
|
addss m5, [ptrq + 4*xq - 4] |
|
movss [ptrq + 4*xq - 4], m5 |
|
dec xq |
|
cmp xq, 0 |
|
jg .loop_scalar_back |
|
.end_scalar_back: |
|
|
|
|
|
sub widthq, remainq |
|
|
|
inc stepd |
|
cmp stepd, stepsd |
|
jl .loop_step |
|
|
|
add ptrq, strideq |
|
inc yd |
|
cmp yd, heightd |
|
jl .loop_y |
|
|
|
RET |
|
%endif |
|
%endmacro |
|
|
|
%if ARCH_X86_64 |
|
INIT_XMM sse4 |
|
HORIZ_SLICE |
|
|
|
%if HAVE_AVX2_EXTERNAL |
|
INIT_YMM avx2 |
|
%xdefine mmnum 16 |
|
HORIZ_SLICE |
|
%endif |
|
|
|
%if HAVE_AVX512_EXTERNAL |
|
INIT_ZMM avx512 |
|
%xdefine mmnum 32 |
|
HORIZ_SLICE |
|
%endif |
|
%endif |
|
|
|
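; postscale_slice(ptr, length, postscale, min, max)
; ptr[i] = clamp(ptr[i]*postscale, min, max) for i in [0, length).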
%macro POSTSCALE_SLICE 0 |
|
cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max |
|
shl lengthd, 2 |
|
add ptrq, lengthq |
|
neg lengthq |
|
%if ARCH_X86_32 |
|
VBROADCASTSS m0, postscalem |
|
VBROADCASTSS m1, minm |
|
VBROADCASTSS m2, maxm |
|
%elif WIN64 |
|
VBROADCASTSS m0, xmm2 |
|
VBROADCASTSS m1, xmm3 |
|
VBROADCASTSS m2, maxm |
|
%else |
|
VBROADCASTSS m0, xmm0 |
|
VBROADCASTSS m1, xmm1 |
|
VBROADCASTSS m2, xmm2 |
|
%endif |
|
|
|
.loop: |
|
%if cpuflag(avx2) || cpuflag(avx512) |
|
mulps m3, m0, [ptrq + lengthq] |
|
%else |
|
movu m3, [ptrq + lengthq] |
|
mulps m3, m0 |
|
%endif |
|
maxps m3, m1 |
|
minps m3, m2 |
|
movu [ptrq+lengthq], m3 |
|
|
|
add lengthq, mmsize |
|
jl .loop |
|
|
|
RET |
|
%endmacro |
|
|
|
INIT_XMM sse |
|
POSTSCALE_SLICE |
|
|
|
%if HAVE_AVX2_EXTERNAL |
|
INIT_YMM avx2 |
|
POSTSCALE_SLICE |
|
%endif |
|
|
|
%if HAVE_AVX512_EXTERNAL |
|
INIT_ZMM avx512 |
|
POSTSCALE_SLICE |
|
%endif |
|
|
|
|
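; verti_slice(buffer, width, height, cbegin, cend, steps, nu, bscale)
; Vertical pass over the column range [cbegin, cend): per step, scale the
; top row by bscale, run the downward recurrence
; buffer[y] += nu*buffer[y-1], scale the bottom row, then run the upward
; recurrence buffer[y-1] += nu*buffer[y]. Columns are processed one full
; vector at a time; the remainder uses masked loads/stores (.x_scalar).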
%macro VERTI_SLICE 0 |
|
%if UNIX64 |
|
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \ |
|
steps, x, y, cwidth, step, ptr, stride |
|
%else |
|
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \ |
|
steps, nu, bscale, x, y, cwidth, step, \ |
|
ptr, stride |
|
%endif |
|
%assign cols mmsize/4 |
|
%if WIN64 |
|
VBROADCASTSS m0, num |
|
VBROADCASTSS m1, bscalem |
|
DEFINE_ARGS buffer, width, height, cbegin, cend, \ |
|
steps, x, y, cwidth, step, ptr, stride |
|
%else |
|
VBROADCASTSS m0, xmm0 |
|
VBROADCASTSS m1, xmm1 |
|
%endif |
|
MOVSXDIFNIDN width, height, cbegin, cend, steps |
|
|
|
mov cwidthq, cendq |
|
sub cwidthq, cbeginq |
|
lea strideq, [widthq * 4] |
|
|
|
xor xq, xq |
|
cmp cwidthq, cols |
|
jl .x_scalar |
|
cmp cwidthq, 0x0 |
|
je .end_scalar |
|
|
|
sub cwidthq, cols |
|
.loop_x: |
|
xor stepq, stepq |
|
.loop_step: |
|
|
|
lea ptrq, [xq + cbeginq] |
|
lea ptrq, [bufferq + ptrq*4] |
|
|
|
|
|
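; top boundary: scale by bscale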
movu m2, [ptrq] |
|
mulps m2, m1 |
|
movu [ptrq], m2 |
|
|
|
|
|
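; downward recurrence; m2 carries the previous row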
mov yq, 1 |
|
.loop_y_down: |
|
add ptrq, strideq |
|
movu m3, [ptrq] |
|
FMULADD_PS m2, m2, m0, m3, m2 |
|
movu [ptrq], m2 |
|
|
|
inc yq |
|
cmp yq, heightq |
|
jl .loop_y_down |
|
|
|
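; bottom boundary: scale by bscale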
mulps m2, m1 |
|
movu [ptrq], m2 |
|
|
|
|
|
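; upward recurrence from the bottom row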
dec yq |
|
.loop_y_up: |
|
sub ptrq, strideq |
|
movu m3, [ptrq] |
|
FMULADD_PS m2, m2, m0, m3, m2 |
|
movu [ptrq], m2 |
|
|
|
dec yq |
|
cmp yq, 0 |
|
jg .loop_y_up |
|
|
|
inc stepq |
|
cmp stepq, stepsq |
|
jl .loop_step |
|
|
|
add xq, cols |
|
cmp xq, cwidthq |
|
jle .loop_x |
|
|
|
add cwidthq, cols |
|
cmp xq, cwidthq |
|
jge .end_scalar |
|
|
|
.x_scalar: |
|
xor stepq, stepq |
|
mov qword [rsp + 0x10], xq |
|
sub cwidthq, xq |
|
mov xq, 1 |
|
shlx cwidthq, xq, cwidthq |
|
sub cwidthq, 1 |
|
PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20 |
|
mov xq, qword [rsp + 0x10] |
|
|
|
.loop_step_scalar: |
|
lea ptrq, [xq + cbeginq] |
|
lea ptrq, [bufferq + ptrq*4] |
|
|
|
VMASKMOVPS m2, [ptrq], m4, k1 |
|
mulps m2, m1 |
|
VMASKMOVPS [ptrq], m2, m4, k1 |
|
|
|
|
|
mov yq, 1 |
|
.x_scalar_loop_y_down: |
|
add ptrq, strideq |
|
VMASKMOVPS m3, [ptrq], m4, k1 |
|
FMULADD_PS m2, m2, m0, m3, m2 |
|
VMASKMOVPS [ptrq], m2, m4, k1 |
|
|
|
inc yq |
|
cmp yq, heightq |
|
jl .x_scalar_loop_y_down |
|
|
|
mulps m2, m1 |
|
VMASKMOVPS [ptrq], m2, m4, k1 |
|
|
|
|
|
dec yq |
|
.x_scalar_loop_y_up: |
|
sub ptrq, strideq |
|
VMASKMOVPS m3, [ptrq], m4, k1 |
|
FMULADD_PS m2, m2, m0, m3, m2 |
|
VMASKMOVPS [ptrq], m2, m4, k1 |
|
|
|
dec yq |
|
cmp yq, 0 |
|
jg .x_scalar_loop_y_up |
|
|
|
inc stepq |
|
cmp stepq, stepsq |
|
jl .loop_step_scalar |
|
|
|
.end_scalar: |
|
RET |
|
%endmacro |
|
|
|
%if ARCH_X86_64 |
|
%if HAVE_AVX2_EXTERNAL |
|
INIT_YMM avx2 |
|
VERTI_SLICE |
|
%endif |
|
|
|
%if HAVE_AVX512_EXTERNAL |
|
INIT_ZMM avx512 |
|
VERTI_SLICE |
|
%endif |
|
%endif |
|
|